# Classification exercises


This is a modified, more compact version of the SVM text classification code from the classification notebook. You may wish to use this as a starting point for doing some of the exercises.

(This first piece of code just imports the necessary libraries.)

In [1]:
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from eli5 import show_weights

## Loading and splitting data

In [9]:
# Amazon reviews
# NOTE: this loader is not fully functional yet. Use the Sentiment140 cell below instead.

import pandas as pd

def class_counts(df, label='class'):
    """Return the per-class row counts of ``df[label]`` as plain text.

    The result has one "value  count" line per distinct value, ordered as
    ``value_counts()`` returns them (most frequent first), with no header line.
    """
    per_class = df[label].value_counts()
    return per_class.to_string(header=False)

# Read the tab-separated reviews file. Passing `names` makes pandas treat the
# first line as data and label the two columns: class label, then review text.
text_data = pd.read_csv('../data/reviews.csv', sep='\t', names=('class', 'text'))
text_data = text_data[['class', 'text']]

# This prints the label distribution (rows per class), not feature counts.
print("Class counts:")
print(class_counts(text_data))

# Divide into train, devel and test sets: 60% train, then the remaining 40%
# is halved into devel and test (20% each). Fixed seeds keep the split
# reproducible between runs.
train_data, devel_and_test_data = train_test_split(text_data, test_size=0.4, random_state=1234)
devel_data, test_data = train_test_split(devel_and_test_data, test_size=0.5, random_state=5678)

# Labels (Y) and raw texts for each split.
train_Y, train_texts = train_data['class'], train_data['text']
devel_Y, devel_texts = devel_data['class'], devel_data['text']
test_Y, test_texts = test_data['class'], test_data['text']

# Peek at the first ten training rows as a sanity check.
print("\n", train_data.head(10))

Feature counts:
4    7901
0    1345

       class                                               text
962       0  Good, flattering fit.  I'm 5'2" and I bought a...
6911      4  I got mine with a elegant blue swoosh on the s...
1201      4  I ordered these shoes the minute I saw them fe...
5210      4  The ear warmers arrived on time and the kids l...
4882      4  Ouch, I'm so hot you'll burn if you touch me. ...
749       0  These were not as expected, there was no cap t...
3887      4  I bought these just before New Years and accid...
4847      4  This dress is a BARGAIN. I'm chaperoning prom ...
8210      4  I can't give you a professional review on thes...
4216      4  The belt clip is a bit fragile so you need to ...


In [2]:
#Sentiment 140

import pandas as pd

def class_counts(df, label='class'):
    """Summarise how many rows of ``df`` carry each value of column ``label``.

    Returns a string with one "value  count" line per distinct value, in the
    order produced by ``value_counts()`` (largest count first) and without a
    header line.
    """
    counts = df[label].value_counts()
    return counts.to_string(header=False)

# Read the tab-separated Sentiment140 file. Passing `names` makes pandas treat
# the first line as data and label the six columns; only the class label and
# the tweet text are kept for classification.
text_data = pd.read_csv('../data/sentiment140.csv', sep='\t', names=('class', 'id', 'time', 'query', 'user', 'tweet'))
text_data = text_data[['class', 'tweet']]

# This prints the label distribution (rows per class), not feature counts.
print("Class counts:")
print(class_counts(text_data))

# Divide into train, devel and test sets: 60% train, then the remaining 40%
# is halved into devel and test (20% each). Fixed seeds keep the split
# reproducible between runs.
train_data, devel_and_test_data = train_test_split(text_data, test_size=0.4, random_state=1234)
devel_data, test_data = train_test_split(devel_and_test_data, test_size=0.5, random_state=5678)

# Labels (Y) and raw texts for each split.
train_Y, train_texts = train_data['class'], train_data['tweet']
devel_Y, devel_texts = devel_data['class'], devel_data['tweet']
test_Y, test_texts = test_data['class'], test_data['tweet']

# Peek at the first ten training rows as a sanity check.
print("\n", train_data.head(10))

Feature counts:
4    800000
0    800000

          class                                              tweet
1063187      4  i wish i had a balloon t-shirt. maybe i shall ...
7089         0  @ddlovato hey demi, wen are you and selena gon...
633572       0   i think the huge bug bite the size of a large...
1019991      4  slept in too late. lots to do, little time to ...
438271       0                          Re: chocos. Its all over 
14104        0  so we never went to softball. o well haha. han...
479775       0  @itshanni3  I felt sorry for the guy though be...
254598       0         @Virkus Duck!  this makes me miss my duck.
796035       0  My family are little shite-ers. Including Daddy. 
1114592      4  Goin back to work next week, so not looking fo...


## Featurization and vectorization

In [3]:
def space_tokenizer(text):
    """Split ``text`` on runs of whitespace; punctuation stays attached to tokens."""
    return text.split()


# TF-IDF weighted unigrams and bigrams over the whitespace tokens.
vectorizer = TfidfVectorizer(tokenizer=space_tokenizer, ngram_range=(1,2))

# Learn the vocabulary and IDF weights from the training texts only, and
# vectorize them in the same pass (equivalent to fit() followed by
# transform(), without tokenizing the training set twice).
train_X = vectorizer.fit_transform(train_texts)

# Devel and test texts are mapped with the already-fitted vocabulary.
devel_X = vectorizer.transform(devel_texts)
test_X = vectorizer.transform(test_texts)

In [4]:
# Size and per-class breakdown of each split.
splits = (
    ("Train:", train_texts, train_data),
    ("Devel:", devel_texts, devel_data),
    ("Test:", test_texts, test_data),
)

for heading, texts, frame in splits:
    print(heading, len(texts))
    print(class_counts(frame), "\n")

Train: 960000
4    480309
0    479691 

Devel: 320000
0    160171
4    159829 

Test: 320000
0    160138
4    159862 



## Training and prediction

In [5]:
%%time
classifier = LinearSVC(
    C=1.0,
    class_weight=None,
    max_iter=1000,
    loss='squared_hinge'
)
classifier.fit(train_X, train_Y)

CPU times: user 26.6 s, sys: 208 ms, total: 26.8 s
Wall time: 26.8 s


In [6]:
%%time
# Predict labels for the development set only; the test set is not used in this cell.
pred_Y = classifier.predict(devel_X)

CPU times: user 56 ms, sys: 0 ns, total: 56 ms
Wall time: 54.3 ms


## Evaluation and analysis

In [7]:
accuracy = accuracy_score(devel_Y, pred_Y)

# labels=[0, 4] fixes the row/column order of the matrix, so ravel() yields
# tn, fp, fn, tp with label 4 treated as the positive class.
tn, fp, fn, tp = confusion_matrix(devel_Y, pred_Y, labels=[0, 4]).ravel()

# Guard the denominators: a classifier that never predicts the positive class
# (or a split with no positive examples) would otherwise divide by zero and
# print NaN. Such metrics are reported as 0 instead.
predicted_positive = tp + fp
actual_positive = tp + fn
precision = tp / predicted_positive if predicted_positive else 0.0
recall = tp / actual_positive if actual_positive else 0.0
pr_sum = precision + recall
f_score = 2 * precision * recall / pr_sum if pr_sum else 0.0

print('accuracy {:.2%}'.format(accuracy))
print('precision {:.2%}, recall {:.2%}, f-score {:.2%}'.format(precision, recall, f_score))

accuracy 81.92%
precision 82.41%, recall 81.13%, f-score 81.76%


In [8]:
# Display the classifier's highest- and lowest-weighted features; the fitted
# vectorizer is passed so the weights are shown against n-gram names.
show_weights(classifier, vec=vectorizer)

Weight?,Feature
+5.348,no problem
+5.201,can't wait
+4.781,not bad
… 2163657 more positive …,… 2163657 more positive …
… 1912120 more negative …,… 1912120 more negative …
-4.487,missed
-4.505,bummer
-4.678,died
-4.718,sorry
-4.766,no
