In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

import warnings
# Silence every warning category for the remainder of the session.
# NOTE(review): this blanket filter would also hide any deprecation or
# convergence warnings raised by the estimators fit below -- consider
# narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Load the fold-annotated training set (the 'kfold' column is read by the
# cross-validation cell) and the tab-separated test set.
df_tv = pd.read_csv('data/train_folds.csv')
df_tt = pd.read_csv('data/test.tsv', sep='\t')

# Rewrite the "n't" suffix as "not" in the test phrases.
# The text column is spelled 'Phrase' everywhere else in this notebook; the
# lowercase 'phrase' key used here before did not match the column that the
# vectorizer later reads, so the cleaned text was never the text being scored.
# NOTE(review): presumably data/train_folds.csv was prepared with the same
# replacement -- confirm against the script that produced it.
df_tt['Phrase'] = df_tt['Phrase'].str.replace("n't", "not", regex=False)

In [3]:
# Fixed seed for the estimators that shuffle data internally, so that the
# generated meta-features are identical on every Restart & Run All.
SEED = 42

# Short model name -> estimator class. The classes are instantiated later as
# classifiers[name](**best_params[name]).
classifiers = {
    'lr': LogisticRegression,
    'lsvc': LinearSVC,
    'mnb': MultinomialNB,
    'sgd': SGDClassifier
}

# Constructor keyword arguments per model, keyed like `classifiers`.
# 'random_state' is supplied only to LinearSVC and SGDClassifier, which use
# randomness while fitting; MultinomialNB does not accept the argument and
# LogisticRegression's default solver does not use it.
best_params = {
    'lr': {'C': 10.0},
    'lsvc': {'C': 1.0, 'random_state': SEED},
    'mnb': {'alpha': 0.4},
    'sgd': {'alpha': 1e-05, 'penalty': 'l2', 'random_state': SEED}
}

# Shared TF-IDF featurizer; it is re-fit by each cell that uses it.
vectorizer = TfidfVectorizer(
    # Tokens are runs of two or more word characters that are not digits.
    token_pattern=r'\b[^\d\W][^\d\W]+\b',
    # Use unigrams and bigrams.
    ngram_range=(1, 2),
    # Drop terms that occur in more than 90% of the documents.
    max_df=0.9,
    lowercase=True
)

In [None]:
# Build out-of-fold predictions: for every fold, fit each classifier on the
# other folds and store its predictions for the held-out rows in a column of
# df_tv named after the classifier.
meta_cols = list(classifiers)

for kfold in range(5):
    df_tr = df_tv[df_tv['kfold'] != kfold]
    df_vl = df_tv[df_tv['kfold'] == kfold]
    # Fit the TF-IDF vocabulary on the training folds only, so the held-out
    # fold does not influence the features it is scored with.
    xtr = vectorizer.fit_transform(df_tr['Phrase'])
    xvl = vectorizer.transform(df_vl['Phrase'])
    ytr = df_tr['Sentiment']
    yvl = df_vl['Sentiment']
    print(f'kfold: {kfold}')
    print('*'*100)
    for name, classifier in classifiers.items():
        ypred = classifier(**best_params[name]).fit(xtr, ytr).predict(xvl)
        # Write predictions back by index label; the column is created on the
        # first fold and filled in by the remaining ones.
        df_tv.loc[df_vl.index, name] = ypred
        print(f'accuracy for {name}: {accuracy_score(yvl, ypred)}')

# Filling the columns one fold at a time leaves NaN placeholders in between,
# which upcasts the predictions to float (written as "2.0" in the CSV).
# Once every row has a prediction, restore the label dtype so this file
# matches the integer predictions written to the test meta file.
# If some rows were never held out, the columns are left untouched.
oof_preds = df_tv[meta_cols]
if oof_preds.notna().all().all():
    df_tv[meta_cols] = oof_preds.astype(df_tv['Sentiment'].dtype)

# Column order follows the insertion order of `classifiers`.
df_tv.to_csv('data/train_meta.csv', columns=['PhraseId', *meta_cols, 'Sentiment'], index=False)

In [7]:
# Produce test-set meta-features: refit the TF-IDF vocabulary on the complete
# training set, then fit every classifier on that full matrix and predict the
# test phrases.
xtv = vectorizer.fit_transform(df_tv['Phrase'])
xtt = vectorizer.transform(df_tt['Phrase'])
ytv = df_tv['Sentiment']
for name, classifier in classifiers.items():
    # Fit on xtv/ytv (all training rows). The leftover xtr/ytr from the
    # cross-validation loop cover only part of the data and were built with a
    # different vocabulary, so their columns do not line up with xtt.
    ytt = classifier(**best_params[name]).fit(xtv, ytv).predict(xtt)
    df_tt[name] = ytt

# Column order follows the insertion order of `classifiers`.
df_tt.to_csv('data/test_meta.csv', columns=['PhraseId', *classifiers], index=False)