In [181]:
import pandas as pd
from sklearn.model_selection import train_test_split
from  wordcloud import wordcloud, STOPWORDS
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [182]:
data= pd.read_table('Downloads/LabelledData.txt', header=None, sep=',,,')

  """Entry point for launching an IPython kernel.


In [183]:
data.rename({0:'Message',1:'Label'}, axis=1, inplace=True)

In [184]:
data

Unnamed: 0,Message,Label
0,how did serfdom develop in and then leave russ...,unknown
1,what films featured the character popeye doyle ?,what
2,how can i find a list of celebrities ' real na...,unknown
3,what fowl grabs the spotlight after the chines...,what
4,what is the full form of .com ?,what
...,...,...
1478,can it be cut to fit ?,affirmation
1479,can it be removed ?,affirmation
1480,does this hose have one ?,affirmation
1481,can i get it in india ?,affirmation


In [185]:
data.Label=data.Label.apply(lambda x:x.strip())

In [186]:
data.Label.value_counts()

what           609
who            402
unknown        272
affirmation    104
when            96
Name: Label, dtype: int64

### Multi-class label clasiification

In [187]:
data['Label'] = data.Label.map({'what':0, 'who':1,'when':2, 'affirmation':3 , 'unknown':4})

### 'what':0, 'who':1,'when':2, 'affirmation':3 , 'unknown':4

In [188]:
features = data.Message
target= data.Label

In [189]:
features_train, features_test, target_train, target_test = train_test_split(features, target, test_size=0.3, random_state=1)
print(features_train.shape)
print(features_test.shape)
print(target_test.shape)
print(target_train.shape)

(1038,)
(445,)
(445,)
(1038,)


### Pre-processing data

In [190]:
def pre_process_text(x):
    x = x.strip()
    x = re.sub(r' +', ' ', x)
    x = re.sub(r"[-()\"#/@;:{}`+=~|.!?,'0-9]", "", x)
    return(x)

In [191]:
data

Unnamed: 0,Message,Label
0,how did serfdom develop in and then leave russ...,4
1,what films featured the character popeye doyle ?,0
2,how can i find a list of celebrities ' real na...,4
3,what fowl grabs the spotlight after the chines...,0
4,what is the full form of .com ?,0
...,...,...
1478,can it be cut to fit ?,3
1479,can it be removed ?,3
1480,does this hose have one ?,3
1481,can i get it in india ?,3


### Tf-idf vectorization

In [192]:
vect = TfidfVectorizer(analyzer='word',lowercase=True, preprocessor=pre_process_text,max_df=1.0, min_df=10,  ngram_range=(1,2), stop_words=STOPWORDS)

In [193]:
vect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=10, ngram_range=(1, 2), norm='l2',
                preprocessor=<function pre_process_text at 0x0000027BEA824828>,
                smooth_idf=True,
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'all', 'also', 'am', 'an', 'and', 'any', 'are',
                            "aren't", 'as', 'at', 'be', 'because', 'been',
                            'before', 'being', 'below', 'between', 'both',
                            'but', 'by', 'can', "can't", 'cannot', 'com', ...},
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabulary=None)

In [194]:
vect.fit(features_train)
features_train_dtm = vect.transform(features_train)
features_train_dtm

  'stop_words.' % sorted(inconsistent))


<1038x36 sparse matrix of type '<class 'numpy.float64'>'
	with 590 stored elements in Compressed Sparse Row format>

### Examining the tokens and their counts

In [195]:
features_train_tokens = vect.get_feature_names()
print(features_train_tokens[:25])
print("=======================================================================================================================")
print(features_train_tokens[-25:])

['american', 'called', 'city', 'company', 'country', 'film', 'find', 'first', 'game', 'invented', 'john', 'known', 'live', 'made', 'man', 'many', 'mean', 'movie', 'name', 'new', 'people', 'played', 'president', 'star', 'state']
['known', 'live', 'made', 'man', 'many', 'mean', 'movie', 'name', 'new', 'people', 'played', 'president', 'star', 'state', 'time', 'two', 'us', 'used', 'war', 'will', 'word', 'work', 'world', 'wrote', 'year']


In [196]:
features_train_dtm.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [197]:
dtm = pd.DataFrame(features_train_dtm.toarray(), columns=vect.get_feature_names())

In [198]:
dtm

Unnamed: 0,american,called,city,company,country,film,find,first,game,invented,...,two,us,used,war,will,word,work,world,wrote,year
0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.759224,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1033,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1034,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.799949,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1035,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1036,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [199]:
dtm.shape

(1038, 36)

In [200]:
target_train.shape

(1038,)

### Naive Bayes model

In [201]:
nb = MultinomialNB()
nb.fit(dtm, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [202]:
features_test_dtm = vect.transform(features_test)

In [203]:
target_pred_class_train = nb.predict(dtm)
target_pred_class_test = nb.predict(features_test_dtm)

In [204]:
print(metrics.accuracy_score(target_test, target_pred_class_test))

0.5528089887640449


In [205]:
print(metrics.confusion_matrix(target_test, target_pred_class_test))

[[190  10   1   0   0]
 [ 84  24   0   1   0]
 [ 20   3   5   1   0]
 [ 21   1   0   5   0]
 [ 51   2   0   4  22]]


In [206]:
print(metrics.classification_report(target_train, target_pred_class_train))

              precision    recall  f1-score   support

           0       0.47      0.93      0.63       408
           1       0.68      0.31      0.43       293
           2       0.71      0.07      0.14        67
           3       0.70      0.27      0.39        77
           4       0.98      0.30      0.46       193

    accuracy                           0.54      1038
   macro avg       0.71      0.38      0.41      1038
weighted avg       0.66      0.54      0.49      1038



In [207]:
print(metrics.classification_report(target_test, target_pred_class_test))

              precision    recall  f1-score   support

           0       0.52      0.95      0.67       201
           1       0.60      0.22      0.32       109
           2       0.83      0.17      0.29        29
           3       0.45      0.19      0.26        27
           4       1.00      0.28      0.44        79

    accuracy                           0.55       445
   macro avg       0.68      0.36      0.40       445
weighted avg       0.64      0.55      0.49       445



### KNN classifier

In [208]:
param_grid = {'n_neighbors': [4,5,6,7,8,9], 'weights': ['distance', 'uniform']}

In [209]:
model = GridSearchCV(KNeighborsClassifier(), param_grid, cv=10, scoring = 'accuracy')
model = model.fit(dtm, target_train)

In [210]:
model.best_params_

{'n_neighbors': 8, 'weights': 'uniform'}

In [211]:
features_test_dtm = vect.transform(features_test)

In [212]:
features_test_dtm

<445x36 sparse matrix of type '<class 'numpy.float64'>'
	with 230 stored elements in Compressed Sparse Row format>

In [213]:
target_pred_class_train = model.predict(dtm)
target_pred_class_test = model.predict(features_test_dtm.toarray())

In [214]:
print(metrics.accuracy_score(target_test, target_pred_class_test))

0.4404494382022472


In [215]:
print(metrics.confusion_matrix(target_test, target_pred_class_test))

[[ 67 126   1   0   7]
 [ 16  92   0   1   0]
 [  5  16   6   1   1]
 [  2  21   0   4   0]
 [  6  42   0   4  27]]


In [216]:
print(metrics.classification_report(target_train, target_pred_class_train))

              precision    recall  f1-score   support

           0       0.56      0.36      0.44       408
           1       0.35      0.78      0.48       293
           2       0.50      0.07      0.13        67
           3       0.72      0.17      0.27        77
           4       0.76      0.34      0.47       193

    accuracy                           0.44      1038
   macro avg       0.58      0.34      0.36      1038
weighted avg       0.54      0.44      0.42      1038



In [217]:
print(metrics.classification_report(target_test, target_pred_class_test))

              precision    recall  f1-score   support

           0       0.70      0.33      0.45       201
           1       0.31      0.84      0.45       109
           2       0.86      0.21      0.33        29
           3       0.40      0.15      0.22        27
           4       0.77      0.34      0.47        79

    accuracy                           0.44       445
   macro avg       0.61      0.37      0.39       445
weighted avg       0.61      0.44      0.43       445



### Logistic regression

In [218]:
logreg = LogisticRegression()
logreg.fit(dtm, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [219]:
target_pred_class_test = logreg.predict(features_test_dtm)
target_pred_class_train = logreg.predict(dtm)
target_pred_prob = logreg.predict_proba(features_test_dtm)[:, 1]

In [220]:
print(metrics.accuracy_score(target_train, target_pred_class_train))

0.5366088631984586


In [221]:
print(metrics.classification_report(target_train, target_pred_class_train))
print(metrics.classification_report(target_test, target_pred_class_test))

              precision    recall  f1-score   support

           0       0.47      0.94      0.63       408
           1       0.68      0.31      0.43       293
           2       0.71      0.07      0.14        67
           3       0.70      0.27      0.39        77
           4       0.98      0.30      0.46       193

    accuracy                           0.54      1038
   macro avg       0.71      0.38      0.41      1038
weighted avg       0.66      0.54      0.49      1038

              precision    recall  f1-score   support

           0       0.52      0.95      0.67       201
           1       0.60      0.22      0.32       109
           2       0.83      0.17      0.29        29
           3       0.45      0.19      0.26        27
           4       1.00      0.28      0.44        79

    accuracy                           0.55       445
   macro avg       0.68      0.36      0.40       445
weighted avg       0.64      0.55      0.49       445

