#  SST Classification Using SVM

In [1]:
#importing libraries 
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline, FeatureUnion
from nltk.corpus import stopwords
import pickle

## Exploring the dataset

In [2]:
train_data = pd.read_csv('stsa.fine.train.converted.csv', sep=',')
train_data

Unnamed: 0,label_fine_grained,label_coarse_grained,sentence
0,4,1,"a stirring , funny and finally transporting re..."
1,1,-1,apparently reassembled from the cutting-room f...
2,1,-1,they presume their audience wo n't sit still f...
3,2,0,the entire movie is filled with deja vu moments .
4,3,1,this is a visually stunning rumination on love...
...,...,...,...
5855,4,1,but the talented cast alone will keep you watc...
5856,3,1,"earnest , unsubtle and hollywood-predictable ,..."
5857,4,1,jeffrey tambor 's performance as the intellige...
5858,3,1,an overly familiar scenario is made fresh by a...


In [3]:
#the number of messages in the dataset 
len(train_data)

5860

In [4]:
trainLabels_grained = train_data['label_fine_grained']
trainLabels_coarse = train_data['label_coarse_grained']

In [5]:
#number of values for fine grained labels
len(train_data['label_fine_grained'].unique())

5

In [6]:
#the distribution of messages for fine grained labels
trainLabels_grained.value_counts()

3    1607
1    1515
2    1117
4     860
0     761
Name: label_fine_grained, dtype: int64

In [7]:
#number of values for coarse grained labels
len(train_data['label_coarse_grained'].unique())

3

In [8]:
#the distribution of messages for coarse grained labels
trainLabels_coarse.value_counts()

 1    2467
-1    2276
 0    1117
Name: label_coarse_grained, dtype: int64

In [9]:
cnt_vectorizer_unigram = CountVectorizer(stop_words='english')
cnt_unigram_train_vector = cnt_vectorizer_unigram.fit_transform(train_data['sentence'])

In [10]:
#Calculating the average number of tokens per message
import numpy as np
tokens = cnt_unigram_train_vector.toarray()
n_tokens = 0
for i in range(len(tokens)): 
    
    count_arr = np.count_nonzero(tokens[i] == 1)
    n_tokens += count_arr 
avg_tokens = n_tokens / len(tokens)
avg_tokens

8.626279863481228

In [11]:
#loading the test dataset
test_data = pd.read_csv('stsa.fine.test.converted.csv', sep=',')
test_data

Unnamed: 0,label_fine_grained,label_coarse_grained,sentence
0,1,-1,"no movement , no yuks , not much of anything ."
1,0,-1,"a gob of drivel so sickly sweet , even the eag..."
2,2,0,` how many more voyages can this limping but d...
3,2,0,so relentlessly wholesome it made me want to s...
4,0,-1,"gangs of new york is an unapologetic mess , wh..."
...,...,...,...
2205,1,-1,the problem with concept films is that if the ...
2206,1,-1,"safe conduct , however ambitious and well-inte..."
2207,1,-1,"a film made with as little wit , interest , an..."
2208,2,0,to enjoy this movie 's sharp dialogue and deli...


In [12]:
#loading the dev dataset
dev_data = pd.read_csv('stsa.fine.dev.converted.csv', sep=',')
dev_data

Unnamed: 0,label_fine_grained,label_coarse_grained,sentence
0,2,0,"in his first stab at the form , jacquot takes ..."
1,1,-1,one long string of cliches .
2,1,-1,if you 've ever entertained the notion of doin...
3,0,-1,k-19 exploits our substantial collective fear ...
4,1,-1,it 's played in the most straight-faced fashio...
...,...,...,...
1096,2,0,`` the ring '' is pretty much an english-langu...
1097,3,1,"smart , provocative and blisteringly funny ."
1098,0,-1,"this one is definitely one to skip , even for ..."
1099,3,1,charles ' entertaining film chronicles seinfel...


## Developing the classifiers 

In [13]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
clf = SVC(random_state=42, kernel='linear')
trained_model = clf.fit(cnt_unigram_train_vector, trainLabels_grained)

In [14]:
devLabels_coarse = dev_data['label_coarse_grained']
testLabels_coarse = test_data['label_coarse_grained']

implementing FeatureUnion()to a Pipeline() using CountVectorizer 

In [15]:

vectorizer_union = FeatureUnion([('cnt_word', CountVectorizer(stop_words='english')),
                               ('cnt_char', CountVectorizer(analyzer='char', ngram_range=(1, 2)))
                               ])

svm_pipeline = Pipeline([
            ('vectorize', vectorizer_union),
            ('classify', SVC(random_state=42, kernel='linear'))
            ])

#training the model 
trained_model_feature_union_cnt = svm_pipeline.fit(train_data['sentence'], trainLabels_coarse)


In [16]:
prediction_dev_cnt = trained_model_feature_union_cnt.predict(dev_data['sentence'])
prediction_test_cnt = trained_model_feature_union_cnt.predict(test_data['sentence'])
print(prediction_dev_cnt,prediction_test_cnt)

[ 1 -1 -1 ...  1  0 -1] [-1 -1  1 ...  1  1  1]


In [17]:

print("Enhanced model - WORD UNIGRAM + CHAR BI-GRAM")
print(classification_report(devLabels_coarse, prediction_dev_cnt))

Enhanced model - WORD UNIGRAM + CHAR BI-GRAM
              precision    recall  f1-score   support

          -1       0.57      0.62      0.59       428
           0       0.26      0.21      0.23       229
           1       0.63      0.64      0.63       444

    accuracy                           0.54      1101
   macro avg       0.49      0.49      0.49      1101
weighted avg       0.53      0.54      0.53      1101



implementing FeatureUnion()to a Pipeline() using TF-IDF

In [18]:
tfidf_vectorizer_unigram = TfidfVectorizer(stop_words='english')
tfidf_unigram_train_vector = tfidf_vectorizer_unigram.fit_transform(train_data['sentence'])

In [19]:
vectorizer_union = FeatureUnion([('tfidf_word', TfidfVectorizer(stop_words='english')),
                               ('tfidf_char', TfidfVectorizer(analyzer='char', ngram_range=(1, 2)))
                               ])

svm_pipeline = Pipeline([
            ('vectorize', vectorizer_union),
            ('classify', SVC(random_state=42, kernel='linear'))
            ])

trained_model_feature_union_tfidf = svm_pipeline.fit(train_data['sentence'], trainLabels_coarse)


In [20]:
prediction_dev_tfidf = trained_model_feature_union_tfidf.predict(dev_data['sentence'])
prediction_test_tfidf = trained_model_feature_union_tfidf.predict(test_data['sentence'])
print(prediction_dev_tfidf,prediction_test_tfidf)

[ 1 -1 -1 ...  1  0 -1] [-1 -1  1 ... -1  1  1]


In [21]:
print("Enhanced model - WORD UNIGRAM + CHAR BI-GRAM")
print(classification_report(devLabels_coarse, prediction_dev_tfidf))

Enhanced model - WORD UNIGRAM + CHAR BI-GRAM
              precision    recall  f1-score   support

          -1       0.61      0.69      0.65       428
           0       0.31      0.13      0.18       229
           1       0.63      0.74      0.68       444

    accuracy                           0.59      1101
   macro avg       0.52      0.52      0.50      1101
weighted avg       0.55      0.59      0.56      1101



implementing FeatureUnion()to a Pipeline() using CountVictorizer and TF-IDF


In [22]:
vectorizer_union_mixed = FeatureUnion([('tfidf_word', TfidfVectorizer(stop_words='english')),
                               ('cnt_char', CountVectorizer(analyzer='char', ngram_range=(1, 2)))
                               ])

svm_pipeline = Pipeline([
            ('vectorize', vectorizer_union),
            ('classify', SVC(random_state=42, kernel='linear'))
            ])

trained_model_feature_union_mixed = svm_pipeline.fit(train_data['sentence'], trainLabels_coarse)


In [23]:
prediction_dev_mixed = trained_model_feature_union_mixed.predict(dev_data['sentence'])
prediction_test_mixed = trained_model_feature_union_mixed.predict(test_data['sentence'])
print(prediction_dev_mixed,prediction_test_mixed)

[ 1 -1 -1 ...  1  0 -1] [-1 -1  1 ... -1  1  1]


In [24]:
print("Enhanced model - WORD UNIGRAM + CHAR BI-GRAM")
print(classification_report(devLabels_coarse, prediction_dev_mixed))

Enhanced model - WORD UNIGRAM + CHAR BI-GRAM
              precision    recall  f1-score   support

          -1       0.61      0.69      0.65       428
           0       0.31      0.13      0.18       229
           1       0.63      0.74      0.68       444

    accuracy                           0.59      1101
   macro avg       0.52      0.52      0.50      1101
weighted avg       0.55      0.59      0.56      1101



TF-IDF model has the highest accuracy and therefore we will use it for the test data

In [25]:
print("Enhanced model - WORD UNIGRAM + CHAR BI-GRAM")
print(classification_report(testLabels_coarse, prediction_test_tfidf))

Enhanced model - WORD UNIGRAM + CHAR BI-GRAM
              precision    recall  f1-score   support

          -1       0.65      0.72      0.69       912
           0       0.26      0.10      0.15       389
           1       0.67      0.78      0.72       909

    accuracy                           0.64      2210
   macro avg       0.53      0.53      0.52      2210
weighted avg       0.59      0.64      0.61      2210



## Applying the best model to the phrases_dataset.tsv

In [26]:
#exploring the dataset 
import csv
dataset_dh = pd.read_csv('phrases_dataset.tsv', delimiter="\t", quoting=csv.QUOTE_NONE, header=None).dropna()
dataset_dh.columns =['sentence', 'score']
dataset_dh.head()


Unnamed: 0,sentence,score
0,"For Nik, he only wants to silence the cacophon...",0.0
1,"""I can play this two ways",0.0
2,"Mild, because it isn't conclusive, and doesn't...",-1.0
3,You can also get some more information about t...,0.0
4,"Soon, Hero, who has never had friends, is thru...",0.0


In [27]:
#applying the model 
prediction_test_dh_tfidf = trained_model_feature_union_tfidf.predict(dataset_dh['sentence'])
print(prediction_test_dh_tfidf)

[-1  0  0 -1 -1 -1 -1 -1 -1  0 -1  1 -1 -1  0  1  1 -1  1  1 -1 -1 -1  1
 -1 -1  1 -1  0 -1  1  1 -1 -1  1  1  1  1  1 -1 -1  1  1  1 -1  1  1  1
  1 -1 -1 -1 -1 -1  1  1  1  1  0 -1 -1 -1  1 -1 -1  1 -1 -1 -1  1  1  0
  1 -1 -1  1 -1 -1  1 -1  1 -1  1  1 -1 -1 -1 -1  0  1  0  1  1  1 -1  1
 -1 -1 -1 -1 -1  1 -1 -1  1  1 -1 -1 -1  0  1 -1  1  0  1  1 -1 -1 -1  1
  1  1  1  1  0  1 -1 -1  1 -1 -1 -1  1 -1 -1  1  1 -1  1  1 -1  1 -1 -1
 -1 -1  1  1  1  1  1 -1 -1 -1  1  1  1 -1 -1  0 -1  1 -1  1 -1 -1  1 -1
  1 -1  1 -1  1  1  1 -1 -1  1  1 -1 -1 -1  1  1 -1 -1 -1 -1  1  1 -1  1
 -1 -1  1 -1 -1 -1 -1 -1 -1  1 -1 -1  1  1 -1  1 -1 -1  1  0 -1 -1  1 -1
  1 -1  1  1 -1 -1  1  1  0 -1  1  0 -1  1 -1 -1  0  1  1 -1  1 -1  1 -1
 -1 -1  1 -1 -1  1  1 -1 -1  1 -1 -1 -1 -1  1 -1 -1  1 -1 -1  1 -1 -1 -1
 -1  0  1 -1 -1 -1  1 -1  1 -1  0  1  1  0 -1  0  1  0  1  1  1 -1  0 -1
 -1 -1  1 -1  1  1 -1  1  1 -1  0  1  1 -1 -1 -1 -1 -1  1 -1  1  1  1 -1
  1  1  1  1 -1 -1  1 -1  0 -1 -1  1 -1 -1  1 -1  1

In [28]:
print("Enhanced model - WORD UNIGRAM + CHAR BI-GRAM")
print(classification_report(dataset_dh['score'], prediction_test_dh_tfidf))

Enhanced model - WORD UNIGRAM + CHAR BI-GRAM
              precision    recall  f1-score   support

        -1.0       0.23      0.74      0.36        62
         0.0       0.50      0.10      0.17       146
         1.0       0.61      0.55      0.58       180

    accuracy                           0.41       388
   macro avg       0.45      0.46      0.37       388
weighted avg       0.51      0.41      0.39       388



## Saving the model

In [29]:
pickle.dump(trained_model_feature_union_tfidf, open("feature_unione.pickle", "wb"))