In [None]:
# *******************************************************************************************
# ************** IMPORT VARIOUS PACKAGES USED IN THIS NOTEBOOK  *****************************
# *******************************************************************************************

import pandas as pd
from pandas import DataFrame
import csv as csv
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import matplotlib.pyplot as plt
import lightgbm as lgb

In [None]:
# *******************************************************************************************
# ************** READ IN DATA AND CONDUCT EXPLORATORY DATA ANALYSIS *************************
# *******************************************************************************************

In [None]:
# read in training data 
train = pd.read_csv("../input/train.tsv",sep='\t')

In [None]:
train.shape

# There are 156,060 rows and 4 columns in the training data set

In [None]:
# Look at first 5 rows

train.head(5)

In [None]:
# read in test data file

test = pd.read_csv("../input/test.tsv",sep='\t')

In [None]:
test.shape

# There are 66,292 rows and 3 columns in the testing data set

In [None]:
# Look at first 5 rows.  This is easier to view in the tsv file directly since the volume of data is very small.

test.head(5)

In [None]:
# read in submission data file.  This is the file I will overwrite when I submit results

submission = pd.read_csv("../input/sampleSubmission.csv",sep=',')

In [None]:
# Look at the first 5 rows.  Note that they are all initially populated with scores of '2' for sentiment

submission.head(5)

In [None]:
# What's the distribution of the sentiment scores in the training data?

train['Sentiment'].value_counts()

# Not surprising. Only 7,072 are negative(0) and 9,206 are positive(4). Most fall in between. 2 (neutral) 

In [None]:
# Let's plot that in a histogram to make it easier to view

plt.hist(train['Sentiment'])

In [None]:
# *******************************************************************************************
# ************************** DATA PREPARATION FOR MODELING **********************************
# *******************************************************************************************

In [None]:
# TF-IDF is used to determine how important a word is in a document.
# It combines two factors: how often the word occurs in the document (term frequency) and how rare it is across
# all documents - rarer is better (inverse document frequency, which down-weights common words like 'the' and
# gives more weight to less common words).

In [None]:
# tfidf vectorizer converts words into a matrix of TF-IDF features.  It removes common words like 'a', and 'the'.

tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Let's run it on the phrases column of the training data.

X_train = tfidf.fit_transform(train.Phrase)

In [None]:
# We now have a matrix of 0's and 1's.  For each of the 156,060 rows, we now have 14,955 columns.  
X_train

In [None]:
print(X_train)

# Note that the first phrase (represented as row 0 below) has 11 core words.  

# "A series of escapades demonstrating the adage that what is good for the goose is also
# good for the gander , some of which occasionally amuses but none of which amounts to much of a story." -->

# Possibly the 11 core words are these (note that filler words are stripped out)
# series, escapades, demonstrating, adage, good, goose, gander, occasionally, amuses, amounts, story

# Each printed line is (row, column) followed by the TF-IDF weight of that single term within that phrase, so a
# value such as .228 is the weight of one word in row 0, not a score for the whole set of words.

# Note that the next phrase (represented as row 1 below) is exactly the same other than it only has 6 of the core words.  

# 'A series of escapades demonstrating the adage that what is good for the goose'

# Possibly it's these...
# series, escapades, demonstrating, adage, good, goose

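# Note that each of these words in the matrix (indicated by the second number) uses the same column indices as above (11645, 4504, etc.).
# The weights are larger here (e.g. .360) because TfidfVectorizer L2-normalizes each row by default and this phrase has
# fewer terms sharing that unit length - it does not mean this phrase is more important than the first one.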

# The next phrase (represented as row 2 below) is 'A series'.  And so on...

In [None]:
# Run the same vectorizer on the test data

X_test = tfidf.transform(test.Phrase)

In [None]:
X_test

# For each of the 66,292 rows, we have 14,955 columns.  The column count matches the training matrix because we
# call transform, not fit_transform: fit_transform learned the vocabulary and IDF weights from the training data,
# and transform maps the test phrases onto that same vocabulary (words not seen in training are ignored).

In [None]:
# Represent the sentiment scores in each row of the training data as 'y' 
y = train.Sentiment

In [None]:
# Split the training data into a train/validation set to iteratively improve the performance of the model.  Will do a 70/30 split

xtrain, xvalid, ytrain, yvalid = train_test_split(X_train, y, 
                                                  stratify=y, 
                                                  random_state=1, 
                                                  test_size=0.3, shuffle=True)

In [None]:
# *******************************************************************************************
# ********************************* DATA MODELING -Linear  SVC ******************************
# *******************************************************************************************

In [None]:
# Train model on full training data, predict test values based on this model
svc = LinearSVC(dual=False).fit(X_train,y).predict(X_test) 

In [None]:
# Add predictions from linear SVC model to a new data frame (df) in a column named 'svc'
df = pd.DataFrame(svc,columns=['svc'])

In [None]:
# Submit predictions from linear SVC

submission['Sentiment'] = df['svc']
submission.to_csv("svc.csv", index=False)

In [None]:
plt.hist(df['svc'])
# Seems to be close to the training set distribution - neutral slightly overstated  

In [None]:
# *******************************************************************************************
# ********************************* DATA MODELING -Logistic Regression **********************
# *******************************************************************************************

In [None]:
# Train model on full training data, predict test values based on this model
lr=LogisticRegression()
lr.fit(X_train,y)
y_pred_lr = lr.predict(X_test)

In [None]:
# Add predictions from logistic regression model to data frame in a column named 'lr'
df['lr'] = y_pred_lr

In [None]:
plt.hist(df['lr'])

In [None]:
# Submit predictions from logistic regression

submission['Sentiment'] = df['lr']
submission.to_csv("lr.csv", index=False)

In [None]:
# *******************************************************************************************
# ********************************* DATA MODELING - XG Boost ********************************
# *******************************************************************************************

In [None]:
# Runs a XG Boost model using the training matrix and corresponding sentiments to predict the values of the testing data

xgb = xgb.XGBClassifier(max_depth=14, n_estimators=500, colsample_bytree=0.8, 
                        subsample=0.8, nthread=10, learning_rate=0.1)
xgb.fit(xtrain, ytrain)
predictions = xgb.predict(xvalid)

print("accuracy_score",accuracy_score(yvalid, predictions))

In [None]:
# Run the best model we can define above on the full test set to predict the sentiment.  

xgb = xgb.predict(X_test)
df = pd.DataFrame(xgb,columns=['xgb'])

In [None]:
submission['Sentiment'] = df['xgb']

In [None]:
submission.to_csv("xgb.csv", index=False)

In [None]:
plt.hist(df['xgb'])

# Not great... massively overestimates neutral sentiment

In [None]:
# *******************************************************************************************
# ******************************* DATA MODELING - LIGHT GBM *********************************
# *******************************************************************************************

In [None]:
lgb = lgb.LGBMClassifier(boosting_type='dart',
                         num_leaves=800,
                        learning_rate=0.05,
                        n_estimators=800,
                        colsample_bytree=.8,
                        num_boost_rounds=800)

# Parameters need to be tuned... this is likely to be significantly overfitting, but scored reasonably well.

In [None]:
lgb.fit(xtrain, ytrain,
        eval_set=[(xvalid, yvalid)],
        eval_metric='multi_logloss',
        early_stopping_rounds=5)

In [None]:
# Predict the values of the valdation set to understand how accurate the model is.

predictions = lgb.predict(xvalid, num_iteration = lgb.best_iteration_)
print("accuracy_score",accuracy_score(yvalid, predictions))

In [None]:
# Predict test values based on the best iteration from above 
df['lgbm'] = lgb.predict(X_test, num_iteration=lgb.best_iteration_)

In [None]:
# Submit predictions from light gradient boosting model

submission['Sentiment'] = df['lgbm']
submission.to_csv("lgbm.csv", index=False)

In [None]:
plt.hist(df['lgbm'])

# Eyeball- seems to overestimate neutral sentiment

In [None]:
# *******************************************************************************************
# ********************************* BLEND THE MODELS ****************************************
# *******************************************************************************************

In [None]:
# Check that predictions have already been added to a data frame
df.head()

In [None]:
# Let's take a majority vote.  Take the most commonly occuring sentiment per row (mode).  
# Ensure that values are integers, not floats

submission['Sentiment'] = df.mode(axis=1)
submission['Sentiment'] = submission.Sentiment.astype(int)

In [None]:
plt.hist(submission['Sentiment'])

In [None]:
# *******************************************************************************************
# ********************************* SUBMIT FILE *********************************************
# *******************************************************************************************

In [None]:
submission.to_csv("blended.csv", index=False)