In [53]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
stop = stopwords.words('english')

In [54]:
train_df = pd.read_csv('data/train.csv', header = None)
train_df.columns = ['reviews','rating']
train_df.dropna(
    axis=0,
    inplace=True
)
train_df.head()

Unnamed: 0,reviews,rating
0,"Definitely runs long, wore 4.5 inch heels and ...",5
1,I think it's a little big/long in the torso.,5
2,This dress is amazing! It has a built in like...,5
3,"didn't even need a bra and I am a 34DD, was so...",5
4,I wore this to my birthday dinner and loved it...,4


In [55]:
# sid = SentimentIntensityAnalyzer()
# train_df["sentiment"] = train_df["reviews"].apply(lambda x: sid.polarity_scores(x))  
# train_df = pd.concat([train_df.drop(['sentiment'], axis=1), train_df['sentiment'].apply(pd.Series)], axis=1)
# train_df.head()

In [56]:
# train_df["word_count"] = train_df["reviews"].apply(lambda x: len(str(x).split()))
# train_df["char_count"] = train_df["reviews"].apply(lambda x: sum(len(word) for word in str(x).split()))
# train_df.head()

In [57]:
# util_cols = ["word_count", "char_count", "compound", "neg", "neu", "pos"]
# util_df = train_df[util_cols]
# train_df.drop(util_cols, axis=1, inplace=True)
# util_df.head()

In [58]:
lemma = WordNetLemmatizer()
train_df["reviews"] = train_df["reviews"].apply(lambda x: " ".join([lemma.lemmatize(word) for word in x.split()]))
train_df["reviews"] = train_df["reviews"].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
#remove punctuation
train_df["reviews"] = train_df["reviews"].str.replace('[^\w\s]','')
tfidf = TfidfVectorizer(max_features=60000, ngram_range=(1,2), use_idf=True, norm='l2', smooth_idf=True, sublinear_tf=True)
X = tfidf.fit_transform(train_df["reviews"])


  train_df["reviews"] = train_df["reviews"].str.replace('[^\w\s]','')


In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, train_df["rating"], test_size=0.2, random_state=42)
weights = pd.Series(y_train).value_counts().reset_index()
weights.columns = ['label', 'counts']
weights['weights'] = weights['counts'].sum() / weights['counts']
weights = weights.set_index('label')['weights'].to_dict()
sample_weights = pd.Series(y_train).map(weights)

In [60]:
model = LogisticRegression(C=1.0, class_weight='balanced', solver='liblinear', multi_class='ovr')

In [61]:
model.fit(X_train, y_train)


LogisticRegression(class_weight='balanced', multi_class='ovr',
                   solver='liblinear')

In [62]:
y_pred = model.predict(X_test)
predicted_categories = y_pred
predicted_categories = pd.Series(predicted_categories)
f1_micro = f1_score(y_test, predicted_categories, average='micro')
f1_macro = f1_score(y_test, predicted_categories, average='macro')
print("F1 Score = {}%".format(100.0 * (f1_micro+f1_macro)/2.0))

F1 Score = 54.39683377897418%


In [64]:
scoring = make_scorer(lambda x,y : (f1_score(x,y,average='micro')+f1_score(x,y,average='macro'))/2, greater_is_better=True)
# modelN = MultinomialNB()
modelN = LogisticRegression(class_weight='balanced', solver='liblinear', multi_class='ovr')
model_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'class_weight': ['balanced', None],
    'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg', 'sag']
}
# model_grid = {
#     'alpha': [0.1, 1, 10, 100, 1000],
#     'fit_prior': [True, False]
# }
clf = GridSearchCV(modelN, model_grid, scoring=scoring, cv=5, verbose=1, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)
print(best_clf.best_estimator_)
print(best_clf.best_params_)
print(best_clf.best_score_)
y_pred = best_clf.predict(X_test)
predicted_categories = y_pred
predicted_categories = pd.Series(predicted_categories)
f1_micro = f1_score(y_test, predicted_categories, average='micro')
f1_macro = f1_score(y_test, predicted_categories, average='macro')
print("F1 Score = {}%".format(100.0 * (f1_micro+f1_macro)/2.0))

Fitting 5 folds for each of 120 candidates, totalling 600 fits


In [None]:
model_RF = RandomForestClassifier()
model_grid = {
    'n_estimators': [100],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
clf = GridSearchCV(model_RF, model_grid, cv=5, verbose=1, n_jobs=-1)
best_clf = clf.fit(X_train, y_train)
print(best_clf.best_estimator_)
print(best_clf.best_params_)
print(best_clf.best_score_)
y_pred = best_clf.predict(X_test)
predicted_categories = y_pred
predicted_categories = pd.Series(predicted_categories)
f1_micro = f1_score(y_test, predicted_categories, average='micro')
f1_macro = f1_score(y_test, predicted_categories, average='macro')
print("F1 Score = {}%".format(100.0 * (f1_micro+f1_macro)/2.0))

Fitting 5 folds for each of 8 candidates, totalling 40 fits


KeyboardInterrupt: 