In [1]:
import pandas as pd
import ast
import copy
import gensim
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split


In [2]:
# preparing data function

def preprocessing(df, map, preprocess_y = True):
    X_df = pd.DataFrame({"Name": [ast.literal_eval(i) for i in df["Name"].values.tolist()], "Descr": [ast.literal_eval(i) for i in df["Descr"].values.tolist()]}) 


    # concatenating "Name" and "Descr" columns

    concat = []
    for j, i in zip(X_df ["Name"].values, X_df ["Descr"].values):
        new = copy.deepcopy(j)
        for k in i:
            new.append(copy.deepcopy(k))
        concat.append(new)
        
    X_df ["Concat"] = concat

    X = X_df["Concat"]
    
    if preprocess_y == True:
        
        y_df = df["Target"].map(map)
        y = y_df.values
        return X, y
    
    else:
        
        return X

In [3]:
# model definition

class Model():
    def __init__(self, embeding_vector_size = 100, skipg_window = 3, min_count = 2, use_sg = 1):
        
        # entry vectors
        
        self.X = "nan"
        self.y = "nan"
        
        # last prediction
        
        self.y_pred = "nan"
        
        # expanded categories vectors 
        
        self.X_sub = "nan"
        self.y_sub = "nan"
        
        # last embeded sentences vector
        
        self.X_vect_avg = []
        
        # created subcategories visualization
        
        self.subcategories_df = "nan"
        
        # number of clusters to generate per class "K"
        
        self.n_clusters = "nan"
        
        # models 
        
        self.w2v_model = "nan"
        self.kmeanModel = "nan"
        self.svc = "nan"
        
        # w2v hyperparameters
        
        self.embeding_vector_size = embeding_vector_size
        self.skipg_window = skipg_window
        self.min_count = min_count
        self.use_sg = use_sg
        
    # cluster mapping function
    
    def __clusterize(self, classes_df, labels):
        new_classes_dic = {}
        for i in classes_df:
            class_labels = labels[i][self.n_clusters[i]]
            new_class_labels = []
            for j in class_labels:
                new_class_labels.append(i + " " + str(j))
            new_classes_dic[i] = pd.DataFrame({"Target": new_class_labels, "sentence": classes_df[i]["sentence"].values.tolist()})
        new_classes_dic = pd.concat(new_classes_dic.values(), ignore_index=True)
        
        self.new_map = {np.unique(new_classes_dic["Target"].values)[i]:i  for i in range(len(np.unique(new_classes_dic["Target"].values)))}
        self.y_sub = new_classes_dic["Target"].map(self.new_map)
        self.X_sub = new_classes_dic["sentence"].values.tolist()
        self.subcategories_df = pd.DataFrame({"Target key": self.y_sub,"Target": new_classes_dic["Target"], "sentence": self.X_sub})
        
    # prediction mapping function

    def __map_prediction(self, y_pred, original_map):
        n_map = {}
        for i in self.new_map:
            for j in original_map:
                if(j == i[:len(j)]):
                    n_map[self.new_map[i]] = original_map[j]
                    break
        return pd.Series(y_pred).map(n_map).values
    
    
    # model training function
    
    def fit(self, X, y, n_clusters):
        
        self.X = X
        self.y = y
        self.n_clusters = n_clusters
        
        # training w2v
        
        self.w2v_model = gensim.models.Word2Vec(sentences = self.X,
                                   vector_size=self.embeding_vector_size,
                                   window=self.skipg_window,
                                   min_count=self.min_count,
                                   sg=self.use_sg)
        
        # arranging sentences with embeded words
        
        words = set(self.w2v_model.wv.index_to_key )
        X_vect = np.array([np.array([self.w2v_model.wv[i] for i in ls if i in words]) for ls in X])

        # averaging words into sentence

        
        for v in X_vect:
            if v.size:
                self.X_vect_avg.append(v.mean(axis=0))
            else:
                self.X_vect_avg.append(np.zeros(vector_size, dtype=float))
                
        # dividing the different categories to prepare them for clustering
                
        df = pd.DataFrame({"Target": self.y, "sentence": self.X_vect_avg})
 
        classes = {i: df[df.Target == map[i]] for i in map}
        for i in classes:
            classes[i] = classes[i].reset_index().drop(columns=["index"])
        
        # clustering from k = 1 to k = 12 to match the given map
        
        labels = {}

        for i in classes:
            i_labels = {}
            K = range(1,12)
            for k in K:
                self.kmeanModel = KMeans(n_clusters=k, algorithm="elkan").fit(classes[i]['sentence'].values.tolist())
                i_labels[k] = self.kmeanModel.labels_
            labels[i] = i_labels
            
        # arranging and selecting clustering results
        
        self.__clusterize(classes, labels)
        
        # training SVC
        
        self.svc = SVC(kernel = "linear").fit(self.X_sub, self.y_sub)
        
    def predict(self, X, original_map):
        
        # arranging sentences with embeded words
        
        words = set(self.w2v_model.wv.index_to_key)
        X_vect = np.array([np.array([self.w2v_model.wv[i] for i in ls if i in words]) for ls in X])

        # averaging words into sentence

        self.X_vect_avg = []
        
        for v in X_vect:
            if v.size:
                self.X_vect_avg.append(v.mean(axis=0))
            else:
                self.X_vect_avg.append(np.zeros(self.embeding_vector_size, dtype=float))

        
        # SVC prediction
        
        self.svc.prediction = self.svc.predict(self.X_vect_avg)
        
        # mapping back and returning prediction values
        
        return self.__map_prediction(self.svc.prediction, original_map)

In [4]:
# data download

df_clean = pd.read_csv("../df_clean-2.csv").drop(columns="index")

# separating known labeled entries

df_data = df_clean[df_clean.Target != "UNKNOWN"]
df_pred = df_clean[df_clean.Target == "UNKNOWN"]
df_data = df_data.reset_index().drop(columns=["index"])
df_pred = df_pred.reset_index().drop(columns=["index"])

# numerical categories map

map = {
"Program":        0,
"Display":        1,
"BTB":            2,
"Search":         3,
"Holiday":        4,
"BTS":            5,
"Email":          6,
"Digital":        7,
"Trad_media":     8
}

# data separation and formating

X, y = preprocessing(df_data, map)

# train test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)


In [5]:
# model instance

clf = Model()

# number of subcategories per category

n_clusters = {
"Program":        10,
"Display":        5,
"BTB":            3,
"Search":         4,
"Holiday":        5,
"BTS":            5,
"Email":          2,
"Digital":        1,
"Trad_media":     1
}

# model training

clf.fit(X_train ,y_train, n_clusters)

# model prediction

y_pred = clf.predict(X_test, map)


  X_vect = np.array([np.array([self.w2v_model.wv[i] for i in ls if i in words]) for ls in X])
  X_vect = np.array([np.array([self.w2v_model.wv[i] for i in ls if i in words]) for ls in X])


In [6]:
clf.subcategories_df

Unnamed: 0,Target key,Target,sentence
0,28,Program 7,"[-0.2096552, 0.55004996, 0.16968152, -0.135076..."
1,24,Program 3,"[-0.1335332, 0.030811075, 0.26544228, 0.192974..."
2,27,Program 6,"[0.09190765, 0.0002664019, -0.25416884, 0.2631..."
3,25,Program 4,"[-0.23053497, -0.40226638, -0.03286588, -0.087..."
4,23,Program 2,"[0.035273463, 0.22589871, 0.24653375, 0.339019..."
...,...,...,...
8882,35,Trad_media 0,"[-0.09336858, 0.27951446, 0.058978777, -0.0878..."
8883,35,Trad_media 0,"[0.022476397, 0.08743594, 0.09709846, -0.19506..."
8884,35,Trad_media 0,"[-0.09336858, 0.27951446, 0.058978777, -0.0878..."
8885,35,Trad_media 0,"[-0.0849318, 0.24255344, 0.054652426, -0.07675..."


In [7]:
accuracy_score(y_test, y_pred)

0.9446506918663516

In [8]:
# predict UNKNOWN data
y_unknown_predictions = clf.predict(preprocessing(df_data, map, False), map)

  X_vect = np.array([np.array([self.w2v_model.wv[i] for i in ls if i in words]) for ls in X])


In [9]:
# to csv
pd.DataFrame({"Tokens": preprocessing(df_data, map, False),"Predictions" : pd.Series(y_unknown_predictions).map({v: k for k, v in map.items()}).values}).to_csv("predicciones.csv")