In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
os.chdir("Desktop")

In [3]:
#import the data file

df = pd.read_csv("file.csv")

In [4]:
#Set messages as the index to ease the preprocessing steps

df.index= df.message

In [5]:
#Remove message as a column to avoid duplicates
df.drop(["message"],1,inplace=True)

In [6]:
df.head()

Unnamed: 0_level_0,food,recharge,support,reminders,travel,nearby,movies,casual,other
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
7am everyday,F,F,F,T,F,F,F,F,F
chocolate cake,T,F,F,F,F,F,F,F,F
closed mortice and tenon joint door dimentions,F,F,T,F,F,F,F,F,F
train eppo kelambum,F,F,F,F,T,F,F,F,F
yesterday i have cancelled the flight ticket,F,F,F,F,T,F,F,F,F


In [None]:
#The goal of the preprocessing is to take the existing data and produce two columns
#column 1 has the message and column 2 has the class it corresponds to 

In [6]:
#Step 1: Convert all True tags into the name of the class they correspond to and store them as a list of series
series_list=[]
for name in df.columns:
    series_list.append(df[name].apply(lambda x: name if x=="T" else x))

In [7]:
#Now concatenate the list of series together to form a new dataframe
new_df= pd.concat(series_list, axis=1)

In [8]:
#Store the range of all the records/rows (messages we have)
my_range = np.arange(0, len(new_df))

In [10]:
#For each row store all the data as a list - with 0 for all "False" and the class name for all "True"
list_classes= []
for y in my_range:
    list_classes.append(list(new_df.iloc[y].apply(lambda x:x if x!="F" else 0)))

In [11]:
#Store the range of the list of classes
range_me= np.arange(0, len(list_classes))

In [12]:
#Remove all 0's from the list such that we are only left with the class name for each record
col= []
for y, w in zip(list_classes, my_range):
    col.append([x for x in list_classes[w] if x!=0])

In [13]:
#assign the class name as a new column in our new dataframe
new_df["class"] = col

In [14]:
#Select only the first class that this record belongs to
new_df['class']= new_df['class'].str.get(0)

In [15]:
new_df.head()

Unnamed: 0_level_0,food,recharge,support,reminders,travel,nearby,movies,casual,other,class
message,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7am everyday,F,F,F,reminders,F,F,F,F,F,reminders
chocolate cake,food,F,F,F,F,F,F,F,F,food
closed mortice and tenon joint door dimentions,F,F,support,F,F,F,F,F,F,support
train eppo kelambum,F,F,F,F,travel,F,F,F,F,travel
yesterday i have cancelled the flight ticket,F,F,F,F,travel,F,F,F,F,travel


In [17]:
#create a new column from the index 
new_df["message"]= new_df.index

In [18]:
df = new_df[["message","class"]]

In [19]:
df.head()

Unnamed: 0_level_0,message,class
message,Unnamed: 1_level_1,Unnamed: 2_level_1
7am everyday,7am everyday,reminders
chocolate cake,chocolate cake,food
closed mortice and tenon joint door dimentions,closed mortice and tenon joint door dimentions,support
train eppo kelambum,train eppo kelambum,travel
yesterday i have cancelled the flight ticket,yesterday i have cancelled the flight ticket,travel


In [20]:
df_range= np.arange(len(df))

In [21]:
df.index = df_range

In [None]:
#Now that all the basic data cleaning steps are complete, we need to apply NLP to vectorize the data

In [22]:
#First step towards text vectorization is text cleaning

import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
wnl= WordNetLemmatizer()
#.lemmatize(text, pos='v'))

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    #Lemmetization
    lem_tokens = [wnl.lemmatize(word, pos="v") for word in filtered_tokens]
    # re-create document from filtered tokens
    doc = ' '.join(lem_tokens)
    return doc

In [23]:
normalize_corpus = np.vectorize(normalize_document)

In [24]:
text = df["message"]

In [25]:
norm_corpus = normalize_corpus(text)

In [26]:
#we will use a TFIDF vectorizer in this case
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
tf = TfidfVectorizer()

In [28]:
tf_vec= tf.fit_transform(norm_corpus)

In [29]:
vocab = tf.get_feature_names()

In [30]:
#Now let us set up our target variable
y = df["class"]

In [32]:
#Let us evaluate it for class imbalance
y.value_counts(normalize=True)

travel       0.272092
reminders    0.174795
casual       0.147446
food         0.097863
nearby       0.091148
other        0.070907
recharge     0.062151
movies       0.055978
support      0.027620
Name: class, dtype: float64

In [33]:
#Travel seems to have the highest proportion of messages
#Apart from this no major signs of class imbalance

In [34]:
from sklearn.model_selection import train_test_split as tts

In [35]:
X_train, X_test, y_train, y_test = tts(tf_vec, y, test_size=0.3, random_state=42)

In [36]:
from sklearn.naive_bayes import MultinomialNB

In [37]:
nb = MultinomialNB()

In [38]:
nb.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [39]:
y_pred_nb= nb.predict(X_test)

In [40]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [41]:
accuracy_score(y_test, y_pred_nb)

0.7124938514510576

In [43]:
confusion_matrix(y_test, y_pred_nb)

array([[1071,    7,    5,    4,   10,    5,   78,    0,  627],
       [  23,  588,    0,   41,    3,   20,   42,    1,  440],
       [  25,    1,  396,   26,    3,    0,   14,    0,  199],
       [  36,   41,    8,  623,   17,    2,   34,    3,  360],
       [  73,   14,    5,   33,  480,    8,   44,    0,  245],
       [  23,    4,    0,   10,    2,  406,   34,    0,  268],
       [  33,    4,    2,   11,    2,   14, 1882,    0,  175],
       [  42,    4,    0,   72,   16,    5,   34,   27,  157],
       [  32,    7,    1,   16,    0,   13,   29,    0, 3218]])

In [44]:
from sklearn.svm import SVC

In [45]:
svc = SVC(kernel="linear")

In [46]:
svc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [47]:
y_pred_svc= svc.predict(X_test)

In [48]:
accuracy_score(y_test, y_pred_svc)

0.7763567797999672

In [49]:
from sklearn.linear_model import LogisticRegression

In [50]:
logreg= LogisticRegression()

In [51]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [52]:
y_pred_logreg= logreg.predict(X_test)

In [53]:
accuracy_score(y_test, y_pred_logreg)

0.7714379406460076

In [54]:
from sklearn.ensemble import RandomForestClassifier

In [55]:
rf = RandomForestClassifier()

In [56]:
rf.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [57]:
y_pred_rf= rf.predict(X_test)

In [58]:
accuracy_score(y_test, y_pred_rf)

0.749795048368585

In [None]:
#So far logistic regression has the highest accuracy score. Let us see if we can improve it through Grid Search CV

In [59]:
# Create regularization penalty space for logistic regression
penalty = ['l1', 'l2']

# Create regularization hyperparameter space
C = np.logspace(0, 4, 10)

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

In [60]:
from sklearn.model_selection import GridSearchCV

In [61]:
clf = GridSearchCV(logreg, hyperparameters, cv=5, verbose=0)

In [62]:
clf.fit(X_train, y_train)



GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': array([1.00000e+00, 2.78256e+00, 7.74264e+00, 2.15443e+01, 5.99484e+01,
       1.66810e+02, 4.64159e+02, 1.29155e+03, 3.59381e+03, 1.00000e+04]), 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [None]:
print('Best Penalty:', clf.best_estimator_.get_params()['penalty'])
print('Best C:', clf.best_estimator_.get_params()['C'])

In [None]:
y_pred_gscv= clf.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_gscv)

In [70]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 400, num = 50)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}
print(random_grid)

{'n_estimators': [200, 204, 208, 212, 216, 220, 224, 228, 232, 236, 240, 244, 248, 253, 257, 261, 265, 269, 273, 277, 281, 285, 289, 293, 297, 302, 306, 310, 314, 318, 322, 326, 330, 334, 338, 342, 346, 351, 355, 359, 363, 367, 371, 375, 379, 383, 387, 391, 395, 400], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]}


In [71]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, 
                               random_state=42, n_jobs = -1)

In [73]:
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
y_pred_rf_random = rf_random.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_rf_random)

In [None]:
#Due to CPU limitations the above GridSearchCV operation was not able to be performed. 

In [None]:
#Concluding this model by selecting the Logistic regression algorithm which provided the highest accuracy. 