In [1]:
import pandas as pd

In [2]:
# Read the pre-cleaned training data from the working directory and
# preview its first five rows (the bare last expression is the cell output).
train_clean = pd.read_csv(filepath_or_buffer='./train_clean.csv')
train_clean.head(n=5)

Unnamed: 0.1,Unnamed: 0,qid,question_text,target,clean_text
0,0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,quebec nationalist see province nation
1,1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,adopted dog would encourage people adopt shop
2,2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,velocity affect time velocity affect space geo...
3,3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,otto von guericke used magdeburg hemisphere
4,4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,convert montra helicon mountain bike changing ...


In [3]:
from sklearn.model_selection import train_test_split

# Keep only the model input (cleaned text) and the label column.
train_clean = train_clean[['clean_text','target']]

# 80/20 split, stratified on the label so both parts keep the same class
# ratio. The split is seeded so that re-running the notebook yields the same
# partition and the metrics reported further down stay comparable.
train,test = train_test_split(
    train_clean,
    train_size=.8,
    stratify=train_clean['target'],
    random_state=42,
)

In [4]:
# Partition the training split by label: target == 0 goes to train_sincere,
# target == 1 goes to train_insincere. The printed shapes show the imbalance.
train_sincere = train.loc[train['target'] == 0]
train_insincere = train.loc[train['target'] == 1]
for class_frame in (train_sincere, train_insincere):
    print(class_frame.shape)

(980249, 2)
(64648, 2)


In [5]:
from sklearn.utils import resample

# Upsample the insincere rows (sampling with replacement) until there are as
# many of them as sincere rows. The target size is taken from the data rather
# than hard-coded, so the classes stay balanced if the input or split changes.
insincere_upsampled = resample(
    train_insincere,
    replace=True,
    n_samples=len(train_sincere),
    random_state=42,
)
train_upsample = pd.concat([train_sincere, insincere_upsampled])

In [6]:
# Downsample the sincere rows (sampling without replacement) to the number of
# insincere rows. The target size is taken from the data rather than
# hard-coded, so the classes stay balanced if the input or split changes.
sincere_downsampled = resample(
    train_sincere,
    replace=False,
    n_samples=len(train_insincere),
    random_state=42,
)
train_downsample = pd.concat([train_insincere, sincere_downsampled])

In [7]:
# Separate features (cleaned text) from labels for every dataset variant:
# the original training split, the held-out test split, and the two
# class-rebalanced training sets.
X_train, y_train = train['clean_text'], train['target']
X_test, y_test = test['clean_text'], test['target']
X_upsample, y_upsample = train_upsample['clean_text'], train_upsample['target']
X_downsample, y_downsample = train_downsample['clean_text'], train_downsample['target']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over word unigrams and bigrams. Terms occurring in fewer than 3
# documents or in more than 90% of documents are dropped, term frequency is
# scaled sublinearly, accents are stripped and English stop words removed.
tfidf_params = dict(
    analyzer='word',
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
    smooth_idf=True,
    strip_accents='unicode',
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_params)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
# NOTE(review): astype('U') casts every entry to a unicode string; presumably
# this guards against missing values, which would then become the literal
# token 'nan' - confirm whether clean_text contains NaNs.
Z_train = vectorizer.fit_transform(X_train.astype('U'))
Z_test = vectorizer.transform(X_test.astype('U'))

In [9]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_validate  # not used in this cell

# Baseline: linear SVM with default settings on the unresampled training
# split. fit() returns the estimator itself, so construction and training
# are chained.
# NOTE(review): no random_state is passed, so repeated runs may differ slightly.
svm = LinearSVC().fit(Z_train, y_train)

# Mean accuracy on the held-out test split (displayed as the cell output).
svm.score(Z_test, y_test)

0.9518346253229975

In [10]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline model; rows are true labels and columns
# are predicted labels. Despite its name, confusion_train is computed on the
# held-out test split.
confusion_train = confusion_matrix(y_test, svm.predict(Z_test))

# Counts needed for the positive-class (target == 1) metrics.
true_pos = confusion_train[1, 1]
false_pos = confusion_train[0, 1]
actual_pos = confusion_train[1].sum()

prec = true_pos / (true_pos + false_pos)
rec = true_pos / actual_pos
f1 = 2 / (1 / prec + 1 / rec)  # harmonic mean of precision and recall

for label, value in (
    ('confusion matrix: ', confusion_train),
    ('precision: ', prec),
    ('recall: ', rec),
    ('f1: ', f1),
):
    print(label, value)

confusion matrix:  [[241210   3853]
 [  8729   7433]]
precision:  0.6586035796562112
recall:  0.45990595223363445
f1:  0.5416059457883999


In [11]:
# Same TF-IDF configuration as before, but the vocabulary and IDF weights are
# now learned from the upsampled training text.
tfidf_params = dict(
    analyzer='word',
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
    smooth_idf=True,
    strip_accents='unicode',
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Z_train and Z_test are overwritten here; the test split is re-projected
# onto the newly fitted vocabulary.
# NOTE(review): astype('U') casts every entry to a unicode string; presumably
# this guards against missing values, which would then become the literal
# token 'nan' - confirm whether clean_text contains NaNs.
Z_train = vectorizer.fit_transform(X_upsample.astype('U'))
Z_test = vectorizer.transform(X_test.astype('U'))

In [12]:
# Linear SVM with default settings, trained on the upsampled training set.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): no random_state is passed, so repeated runs may differ slightly.
svm = LinearSVC().fit(Z_train, y_upsample)

# Mean accuracy on the held-out test split (displayed as the cell output).
svm.score(Z_test, y_test)

0.9437190161737965

In [13]:
# Confusion matrix of the model trained on upsampled data, evaluated on the
# held-out test split; rows are true labels and columns are predicted labels.
confusion_upsample = confusion_matrix(y_test, svm.predict(Z_test))

# Counts needed for the positive-class (target == 1) metrics.
true_pos = confusion_upsample[1, 1]
false_pos = confusion_upsample[0, 1]
actual_pos = confusion_upsample[1].sum()

prec = true_pos / (true_pos + false_pos)
rec = true_pos / actual_pos
f1 = 2 / (1 / prec + 1 / rec)  # harmonic mean of precision and recall

for label, value in (
    ('confusion matrix: ', confusion_upsample),
    ('precision: ', prec),
    ('recall: ', rec),
    ('f1: ', f1),
):
    print(label, value)

confusion matrix:  [[238612   6451]
 [  8251   7911]]
precision:  0.5508285754073249
recall:  0.4894814998143794
f1:  0.5183462193683659


In [14]:
# Same TF-IDF configuration as before, but the vocabulary and IDF weights are
# now learned from the downsampled training text.
tfidf_params = dict(
    analyzer='word',
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
    smooth_idf=True,
    strip_accents='unicode',
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_params)

# Z_train and Z_test are overwritten here; the test split is re-projected
# onto the newly fitted vocabulary.
# NOTE(review): astype('U') casts every entry to a unicode string; presumably
# this guards against missing values, which would then become the literal
# token 'nan' - confirm whether clean_text contains NaNs.
Z_train = vectorizer.fit_transform(X_downsample.astype('U'))
Z_test = vectorizer.transform(X_test.astype('U'))

In [15]:
# Linear SVM with default settings, trained on the downsampled training set.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): no random_state is passed, so repeated runs may differ slightly.
svm = LinearSVC().fit(Z_train, y_downsample)

# Mean accuracy on the held-out test split (displayed as the cell output).
svm.score(Z_test, y_test)

0.8723935304813858

In [16]:
# Confusion matrix of the model trained on downsampled data, evaluated on the
# held-out test split; rows are true labels and columns are predicted labels.
confusion_downsample = confusion_matrix(y_test, svm.predict(Z_test))

# Counts needed for the positive-class (target == 1) metrics.
true_pos = confusion_downsample[1, 1]
false_pos = confusion_downsample[0, 1]
actual_pos = confusion_downsample[1].sum()

prec = true_pos / (true_pos + false_pos)
rec = true_pos / actual_pos
f1 = 2 / (1 / prec + 1 / rec)  # harmonic mean of precision and recall

for label, value in (
    ('confusion matrix: ', confusion_downsample),
    ('precision: ', prec),
    ('recall: ', rec),
    ('f1: ', f1),
):
    print(label, value)

confusion matrix:  [[213731  31332]
 [  2002  14160]]
precision:  0.311263518860459
recall:  0.8761291919316916
f1:  0.45933759366788857


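Forcing class balance through either upsampling or downsampling seems to reduce accuracy on the test set: downsampling raises recall considerably, but at a large cost in precision, and F1 drops in both cases.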

In [17]:
# Refit the TF-IDF vectorizer on the original (unresampled) training text,
# because Z_train and Z_test were overwritten by the resampling experiments.
tfidf_params = dict(
    analyzer='word',
    ngram_range=(1,2),
    min_df=3,
    max_df=0.9,
    sublinear_tf=True,
    smooth_idf=True,
    strip_accents='unicode',
    stop_words='english',
)
vectorizer = TfidfVectorizer(**tfidf_params)

# NOTE(review): astype('U') casts every entry to a unicode string; presumably
# this guards against missing values, which would then become the literal
# token 'nan' - confirm whether clean_text contains NaNs.
Z_train = vectorizer.fit_transform(X_train.astype('U'))
Z_test = vectorizer.transform(X_test.astype('U'))

In [18]:
# Linear SVM on the unresampled training split, with class weights set to
# 'balanced' instead of physically resampling the data.
# fit() returns the estimator itself, so construction and training are chained.
# NOTE(review): no random_state is passed, so repeated runs may differ slightly.
svm = LinearSVC(class_weight='balanced').fit(Z_train, y_train)

# Mean accuracy on the held-out test split (displayed as the cell output).
svm.score(Z_test, y_test)

0.9206507799789454

In [19]:
# Confusion matrix of the class-weighted model, evaluated on the held-out
# test split; rows are true labels and columns are predicted labels.
confusion_balanced = confusion_matrix(y_test, svm.predict(Z_test))

# Counts needed for the positive-class (target == 1) metrics.
true_pos = confusion_balanced[1, 1]
false_pos = confusion_balanced[0, 1]
actual_pos = confusion_balanced[1].sum()

prec = true_pos / (true_pos + false_pos)
rec = true_pos / actual_pos
f1 = 2 / (1 / prec + 1 / rec)  # harmonic mean of precision and recall

for label, value in (
    ('confusion matrix: ', confusion_balanced),
    ('precision: ', prec),
    ('recall: ', rec),
    ('f1: ', f1),
):
    print(label, value)

confusion matrix:  [[228321  16742]
 [  3986  12176]]
precision:  0.42105263157894735
recall:  0.753372107412449
f1:  0.5401952085181899


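Compared with the unweighted baseline, the balanced parameter raises recall (about 0.46 to 0.75) at the cost of lower precision (about 0.66 to 0.42), leaving F1 essentially unchanged (about 0.54). Recall is still lower than with downsampling (about 0.88).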