In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# following cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Copy the test and train CSV files from the mounted Drive folder into the
# current working directory, where the loading cell reads them by bare filename.
!cp /content/drive/MyDrive/emd/test.csv .
!cp /content/drive/MyDrive/emd/train.csv .

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from ast import literal_eval
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from imblearn.under_sampling import RandomUnderSampler
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score
import random
from collections import Counter
from sklearn.tree import DecisionTreeClassifier



In [4]:
# One seed value, applied to both global random number generators
# (Python's `random` module and NumPy's legacy global state).
seed = 0
for reseed in (random.seed, np.random.seed):
    reseed(seed)

In [5]:
def load_reviews(csv_path):
    """Load a review CSV and return a frame with parsed ``text`` and int ``score``.

    Parameters
    ----------
    csv_path : str
        Path of a CSV file containing at least ``text`` and ``score`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with exactly the columns ``text`` and ``score``. Each
        ``text`` cell is the result of ``literal_eval`` on the stored string
        (token lists in this dataset); ``score`` is cast to ``int``.
    """
    raw = pd.read_csv(csv_path)
    # .assign() builds a new frame instead of writing columns onto a
    # column-subset of `raw`, which is the pattern that triggers pandas'
    # SettingWithCopyWarning.
    return raw[['text', 'score']].assign(
        text=lambda frame: frame['text'].apply(literal_eval),
        score=lambda frame: frame['score'].astype(int),
    )


# Same loading and parsing steps for both splits.
train_df = load_reviews("train.csv")
test_df = load_reviews("test.csv")

In [None]:
# Preview the first rows of the parsed training frame.
train_df.head()

Unnamed: 0,text,score
0,"[enjoy, game, enough, difficulti, keep, play, ...",4
1,"[guess, good, app, like, realli, good, challen...",3
2,"[never, would, expect, level, qualiti, got, ga...",4
3,"[love, play, game, lot, fun, pa, time, quickli...",5
4,"[great, game, grandson, love, dinosaur, eat, c...",5


In [None]:
# Display the training labels; pandas abbreviates the long Series in its repr.
train_df['score']

0         4
1         3
2         4
3         5
4         5
         ..
500165    2
500166    5
500167    5
500168    5
500169    1
Name: score, Length: 500170, dtype: int64

In [6]:
def dummy_fun(doc):
    """Return ``doc`` unchanged.

    Used as both the tokenizer and the preprocessor of the TF-IDF vectorizer
    defined right after this function, so that already-tokenized documents
    pass through those stages untouched.
    """
    return doc

# The documents are already token lists, so the tokenizer and preprocessor are
# the identity function and the regex token pattern is switched off.
# The vocabulary is capped at 25000 terms, and terms that appear in fewer than
# 5 documents are dropped.
tfidf_options = dict(
    analyzer='word',
    tokenizer=dummy_fun,
    preprocessor=dummy_fun,
    token_pattern=None,
    max_features=25000,
    min_df=5,
)
tfidf = TfidfVectorizer(**tfidf_options)


In [None]:
# Despite the variable name this is a truncated SVD, which operates directly on
# the sparse TF-IDF matrix. random_state pins the randomized solver so the
# projection does not depend on how much of the global NumPy RNG stream earlier
# cells have consumed.
pca = TruncatedSVD(n_components=500, random_state=seed)

In [None]:
# Hold out 10% of the training data for validation, stratified by score.
# random_state makes the split reproducible regardless of global RNG state or
# the order in which cells were executed.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'].tolist(),
    train_df['score'],
    test_size=0.1,
    stratify=train_df['score'],
    random_state=seed,
)

In [None]:
# Fit the vectorizer on the training split only, then apply the fitted
# vectorizer to the validation split.
X_train_tfidf = tfidf.fit_transform(X_train)
X_val_tfidf = tfidf.transform(X_val)

In [None]:
# Shape of the training TF-IDF matrix.
X_train_tfidf.shape

(450153, 17293)

In [None]:
# Shape of the validation TF-IDF matrix (same column count as the training one).
X_val_tfidf.shape

(50017, 17293)

In [None]:
# Fit the SVD on the training TF-IDF matrix and project it to 500 components.
X_train_pca = pca.fit_transform(X_train_tfidf)
X_train_pca.shape

(450153, 500)

In [None]:
# Project the validation split with the SVD fitted on the training split.
X_val_pca = pca.transform(X_val_tfidf)
X_val_pca.shape

(50017, 500)

In [None]:
# Total fraction of variance captured by the retained SVD components.
pca.explained_variance_ratio_.sum()

0.5574000819080314

In [None]:
# Balance the SVD-projected training split by random undersampling. With the
# default sampling strategy every class is reduced to the minority-class count.
rus = RandomUnderSampler(random_state=42)
X_res_pca, y_res_pca = rus.fit_resample(X_train_pca, y_train)
X_res_pca.shape



(136625, 500)

In [None]:
# Class counts after undersampling, then for the original training split.
for labels in (y_res_pca, y_train):
    print(Counter(labels))

Counter({1: 27325, 2: 27325, 3: 27325, 4: 27325, 5: 27325})
Counter({5: 227969, 4: 94393, 3: 51753, 1: 48713, 2: 27325})


In [None]:
# Same random undersampling as for the SVD features, applied to the sparse
# TF-IDF training matrix.
rus = RandomUnderSampler(random_state=42)
X_res_tfidf, y_res_tfidf = rus.fit_resample(X_train_tfidf, y_train)
X_res_tfidf.shape



(136625, 17293)

In [None]:
# Chance-level baselines fitted on the full (imbalanced) training split.
param_grid = {
    'strategy': ['stratified', 'prior'],
}

for params in ParameterGrid(param_grid):
    # random_state pins the random predictions of the 'stratified' strategy.
    clf = DummyClassifier(random_state=seed, **params)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_val_tfidf)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)


{'strategy': 'stratified'} 0.19815782735144583
{'strategy': 'prior'} 0.13447118000716685


In [None]:
# Chance-level baselines fitted on the undersampled (balanced) training data.
# The grid is defined here so this cell does not depend on whichever cell last
# assigned `param_grid`.
param_grid = {
    'strategy': ['stratified', 'prior'],
}

for params in ParameterGrid(param_grid):
    # random_state pins the random predictions of the 'stratified' strategy.
    clf = DummyClassifier(random_state=seed, **params)
    clf.fit(X_res_tfidf, y_res_tfidf)
    y_pred = clf.predict(X_val_tfidf)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)


{'strategy': 'stratified'} 0.17370364636779578
{'strategy': 'prior'} 0.03906187984845751


In [None]:
# Multinomial naive Bayes on the full (imbalanced) training split, sweeping the
# additive smoothing parameter alpha.
param_grid = {
    'alpha': [0.001, 0.05, 0.1, 0.2, 0.5, 1, 2],
}

for params in ParameterGrid(param_grid):
    clf = MultinomialNB(**params)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_val_tfidf)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)

{'alpha': 0.001} 0.3116103412435585
{'alpha': 0.05} 0.3105122059734735
{'alpha': 0.1} 0.3096435542997257
{'alpha': 0.2} 0.30806679975607076
{'alpha': 0.5} 0.301379733457689
{'alpha': 1} 0.2929292238707383
{'alpha': 2} 0.2712695013337552


In [None]:
# Multinomial naive Bayes on the undersampled (balanced) training data, with a
# wider alpha range than the sweep on the full training split.
param_grid = {
    'alpha': [0.001, 0.05, 0.1, 0.2, 0.5, 1, 2, 3, 5, 10, 20, 50, 100],
}
for params in ParameterGrid(param_grid):
    clf = MultinomialNB(**params)
    clf.fit(X_res_tfidf, y_res_tfidf)
    y_pred = clf.predict(X_val_tfidf)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)

{'alpha': 0.001} 0.405449763664446
{'alpha': 0.05} 0.4109673796708053
{'alpha': 0.1} 0.4135537938840884
{'alpha': 0.2} 0.41658281585262247
{'alpha': 0.5} 0.42297322755791866
{'alpha': 1} 0.4270143027427464
{'alpha': 2} 0.4303982505030586
{'alpha': 3} 0.4316434716901316
{'alpha': 5} 0.4326376702462345
{'alpha': 10} 0.4332916141108399
{'alpha': 20} 0.43458317379076616
{'alpha': 50} 0.43465836812671893
{'alpha': 100} 0.4342996523466809


In [None]:
# Draw a small balanced subset: 2000 samples for each of the five score classes.
samples = {i: 2000 for i in [1, 2, 3, 4, 5]}
# sampling_strategy must be passed by keyword: the sampler's constructor
# arguments are keyword-only in current imbalanced-learn releases.
rus = RandomUnderSampler(sampling_strategy=samples, random_state=42)
# NOTE(review): X_knn / y_knn are not used by any later cell in this notebook.
X_knn, y_knn = rus.fit_resample(X_train_pca, y_train)
X_knn.shape



(10000, 500)

In [None]:
# Decision tree on the undersampled TF-IDF features.
param_grid = {
    'min_samples_split': [2, 3, 5],
}
for params in ParameterGrid(param_grid):
    # The tree permutes features while searching for splits; random_state makes
    # each fit reproducible on its own.
    clf = DecisionTreeClassifier(random_state=seed, **params)
    clf.fit(X_res_tfidf, y_res_tfidf)
    y_pred = clf.predict(X_val_tfidf)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)

{'min_samples_split': 2} 0.3251900639418596
{'min_samples_split': 3} 0.32145652866840013
{'min_samples_split': 5} 0.31938574138312215


In [None]:
# Decision tree on the full (imbalanced) TF-IDF training split.
param_grid = {
    'min_samples_split': [2, 3, 5],
}
for params in ParameterGrid(param_grid):
    # The tree permutes features while searching for splits; random_state makes
    # each fit reproducible on its own.
    clf = DecisionTreeClassifier(random_state=seed, **params)
    clf.fit(X_train_tfidf, y_train)
    y_pred = clf.predict(X_val_tfidf)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)

{'min_samples_split': 2} 0.35363003838205803
{'min_samples_split': 3} 0.3514380325448645
{'min_samples_split': 5} 0.352342260200576


In [None]:
# Decision tree on the undersampled, SVD-projected features.
param_grid = {
    'min_samples_split': [2, 3, 5],
}
for params in ParameterGrid(param_grid):
    # The tree permutes features while searching for splits; random_state makes
    # each fit reproducible on its own.
    clf = DecisionTreeClassifier(random_state=seed, **params)
    clf.fit(X_res_pca, y_res_pca)
    y_pred = clf.predict(X_val_pca)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)

{'min_samples_split': 2} 0.2602301003001158
{'min_samples_split': 3} 0.2614235248190827
{'min_samples_split': 5} 0.2573782273237038


In [None]:
# Support vector classifier (default kernel) on the undersampled SVD features,
# sweeping the regularisation strength C.
# NOTE(review): the saved output for this cell shows only the LibSVM banner and
# no scores, so the sweep may never have finished - confirm before relying on it.
param_grid = {
    'C': [0.1, 1, 10],
}
for params in ParameterGrid(param_grid):
    clf = SVC(verbose=True, **params)
    clf.fit(X_res_pca, y_res_pca)
    y_pred = clf.predict(X_val_pca)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    score = f1_score(y_val, y_pred, average='macro')
    print(params, score)

[LibSVM]

### TEST: final evaluation on the held-out test set

In [7]:
# Refit the vectorizer on the complete training set and apply it to the test set.
# NOTE: this rebinds X_train_tfidf, which the validation section above used for
# the 90% training split; re-running those cells after this one would silently
# use the full-training matrix instead.
train_docs = train_df['text'].tolist()
test_docs = test_df['text'].tolist()
X_train_tfidf = tfidf.fit_transform(train_docs)
X_test_tfidf = tfidf.transform(test_docs)

In [8]:
# Shape of the TF-IDF matrix built from the complete training set.
X_train_tfidf.shape

(500170, 18223)

In [9]:
# Randomly undersample the complete training set (default sampling strategy)
# to obtain the balanced data used for the final naive Bayes model.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X_train_tfidf, train_df['score'])
X_res.shape



(151805, 18223)

In [12]:
# Chance-level baseline on the test set. random_state pins the random
# predictions of the 'stratified' strategy so the reported score is reproducible.
clf = DummyClassifier(strategy='stratified', random_state=seed)
clf.fit(X_train_tfidf, train_df['score'])
y_pred = clf.predict(X_test_tfidf)
score = f1_score(test_df['score'], y_pred, average='macro')
print(score)

0.19650686535914377


In [13]:
# Final naive Bayes model, trained on the undersampled full training set.
# alpha=50 is the value that scored best in the validation sweep on
# undersampled data.
nb = MultinomialNB(alpha=50).fit(X_res, y_res)
nb_pred = nb.predict(X_test_tfidf)
score_nb = f1_score(test_df['score'], nb_pred, average='macro')
print('score nb', score_nb)

score nb 0.43327584677084496


In [14]:
# Final decision tree, trained on the complete (imbalanced) training set, which
# scored higher in validation than the undersampled variants.
# random_state makes the fitted tree, and therefore the reported score,
# reproducible.
tree = DecisionTreeClassifier(min_samples_split=2, random_state=seed)
tree.fit(X_train_tfidf, train_df['score'])
tree_pred = tree.predict(X_test_tfidf)
score_tree = f1_score(test_df['score'], tree_pred, average='macro')
print('score treee', score_tree)

score treee 0.35383580116759833
