### Import modules

In [None]:
import nltk
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# English stop-word list taken from NLTK's stopwords corpus.
# NOTE(review): no cell visible in this notebook reads `stopwords`; the
# TfidfVectorizer is built with stop_words='english' — confirm before removing.
stopwords = nltk.corpus.stopwords.words('english')

# Printed once every statement in this cell has run.
print('Imports Complete')

### Read Data

In [None]:
# Load the train and test CSV files into pandas DataFrames (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)



In [None]:
# Preview the first rows of the training frame to check it loaded as expected.
train_df.head()

### Rebalance and reduce sample
I need to rebalance the data, since there is not an even number of positive and negative examples for training and prediction. Some friends of mine told me to always balance the classes in order to prevent bias. I am also limiting the number of rows (examples) that are processed, because of memory limitations. When you run the code below you will see that the total number of rows after the rebalance is over 80,000. This will not vectorize using our chosen vectorizer (TF-IDF); I have found that 50,000 is about the limit.

In [None]:

from matplotlib import pyplot
import numpy as np
%matplotlib inline

# Look the class counts up by label. value_counts() orders its result by
# frequency, so unpacking it positionally only yields (count of 0, count of 1)
# when class 0 happens to be the majority.
target_counts = train_df['target'].value_counts()
count_target_0 = target_counts.loc[0]
count_target_1 = target_counts.loc[1]

train_df_target_0 = train_df[train_df['target'] == 0]
train_df_target_1 = train_df[train_df['target'] == 1]

# Undersample whichever class is larger so both classes end up with the same
# number of rows. A fixed seed makes the draw repeatable across kernel restarts.
n_per_class = min(count_target_0, count_target_1)
train_df_target_0_under = train_df_target_0.sample(n_per_class, random_state=42)
train_df_target_1_under = train_df_target_1.sample(n_per_class, random_state=42)
train_df_under = pd.concat([train_df_target_0_under, train_df_target_1_under], axis=0)

# Bar chart of the class counts after rebalancing; the trailing semicolon hides
# the Axes repr in the notebook output.
train_df_under['target'].value_counts().plot(kind='bar', title='Count (target)');

In [None]:
# Show the per-class row counts of the rebalanced frame.
train_df_under['target'].value_counts()

In [None]:
# Cap the working set at 50,000 rows. Taking the min() keeps sample() from
# raising when the balanced frame is smaller than the cap, and the fixed seed
# makes the subset reproducible.
sam_train_under = train_df_under.sample(
    n=min(50000, len(train_df_under)), random_state=42)

### Let's get some test/train data
Here is where I split the training data 80/20 (as denoted by the 0.2 in the `test_size` parameter) into a training set and a test set. This 'hold-out' method is common.

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 hold-out split. `stratify` keeps the class ratio the same in both
# halves, and `random_state` makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    sam_train_under[['question_text']],
    sam_train_under['target'],
    test_size=0.2,
    stratify=sam_train_under['target'],
    random_state=42,
)

### Vectorize
Here is where I vectorize the words in the questions. I am using the TF-IDF vectorizer. You could also use an n-gram vectorizer or CountVectorizer as an alternative. Notice that I am removing the stopwords.

In [None]:
# Learn the TF-IDF vocabulary from the training questions only, so nothing
# from the hold-out split leaks into the features.
tfidf_vect = TfidfVectorizer(stop_words='english')
tfidf_vect_fit = tfidf_vect.fit(X_train['question_text'])

tfidf_train = tfidf_vect_fit.transform(X_train['question_text'])
tfidf_test = tfidf_vect_fit.transform(X_test['question_text'])

# Keep the matrices sparse. Calling .toarray() materialises every zero of a
# (rows x vocabulary) matrix and is what exhausts memory on larger samples;
# the estimators used in this notebook accept sparse input directly.
X_train_vect = tfidf_train
X_test_vect = tfidf_test


### Model eval
This is where the rubber meets the road. I am trying out several different machine learning algorithms to find which one works best. Google "no free lunch machine learning" and you will understand why.

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import precision_recall_fscore_support as score
import time

## Start with RandomForest
# Random forest: 150 trees with no depth limit; n_jobs=-1 requests all cores.
rf = RandomForestClassifier(n_estimators=150, max_depth=None, n_jobs=-1)

# Time the training step.
start = time.time()
rf_model = rf.fit(X_train_vect, y_train)
end = time.time()
fit_time = end - start

# Time prediction on the hold-out split.
start = time.time()
y_pred = rf_model.predict(X_test_vect)
end = time.time()
pred_time = end - start

# Binary precision/recall for the positive class (label 1); accuracy is the
# fraction of hold-out predictions that equal the true label.
precision, recall, fscore, train_support = score(
    y_test, y_pred, pos_label=1, average='binary')
accuracy = (y_pred == y_test).sum() / len(y_pred)
report = 'Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'
print(report.format(
    *(round(value, 3) for value in (fit_time, pred_time, precision, recall, accuracy))))

## Try stochastic gradient descent 

In [None]:
# Linear model trained with stochastic gradient descent. The fixed seed makes
# the fitted weights (and therefore the submission) reproducible.
sgd = SGDClassifier(random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start = time.perf_counter()
sgd_model = sgd.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = (end - start)

start = time.perf_counter()
y_pred = sgd_model.predict(X_test_vect)
end = time.perf_counter()
pred_time = (end - start)

# Binary precision/recall for the positive class (label 1); accuracy is the
# fraction of hold-out predictions that equal the true label.
precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
accuracy = (y_pred == y_test).mean()
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round(accuracy, 3)))

In [None]:
### Try Logistic regression

In [None]:
# Logistic regression with the stochastic 'sag' solver. 'sag' shuffles the
# data while fitting, so a fixed seed is needed for repeatable results.
lr = LogisticRegression(C=0.1, solver='sag', random_state=42)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals, unlike the wall-clock time.time().
start = time.perf_counter()
lr_model = lr.fit(X_train_vect, y_train)
end = time.perf_counter()
fit_time = (end - start)

start = time.perf_counter()
y_pred = lr_model.predict(X_test_vect)
end = time.perf_counter()
pred_time = (end - start)

# Binary precision/recall for the positive class (label 1); accuracy is the
# fraction of hold-out predictions that equal the true label.
precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
accuracy = (y_pred == y_test).mean()
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round(accuracy, 3)))

## Try GradientBoost
# Gradient boosting: 150 boosting stages, each tree limited to depth 11.
gb = GradientBoostingClassifier(n_estimators=150, max_depth=11)

# Time the training step.
start = time.time()
gb_model = gb.fit(X_train_vect, y_train)
end = time.time()
fit_time = end - start

# Time prediction on the hold-out split.
start = time.time()
y_pred = gb_model.predict(X_test_vect)
end = time.time()
pred_time = end - start

# Binary precision/recall for the positive class (label 1); accuracy is the
# fraction of hold-out predictions that equal the true label.
precision, recall, fscore, train_support = score(
    y_test, y_pred, pos_label=1, average='binary')
accuracy = (y_pred == y_test).sum() / len(y_pred)
report = 'Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'
print(report.format(
    *(round(value, 3) for value in (fit_time, pred_time, precision, recall, accuracy))))

## Try XGBoost
import xgboost as xgb
# XGBoost with default hyper-parameters. The keyword is `n_jobs` (plural);
# the misspelt `n_job` is not the parallelism setting of the scikit-learn
# wrapper, so the requested thread count was never applied.
gbm = xgb.XGBClassifier(n_jobs=-1)

# Time the training step.
start = time.time()
gbm_model = gbm.fit(X_train_vect, y_train)
end = time.time()
fit_time = (end - start)

# Time prediction on the hold-out split.
start = time.time()
y_pred = gbm_model.predict(X_test_vect)
end = time.time()
pred_time = (end - start)

# Binary precision/recall for the positive class (label 1); accuracy is the
# fraction of hold-out predictions that equal the true label.
precision, recall, fscore, train_support = score(y_test, y_pred, pos_label=1, average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

**I have turned the code blocks for RandomForest, GradientBoost and XGBoost into markdown. I did this because RF and GB take over an hour to process and XGB runs out of memory. LogisticRegression runs REALLY fast if you use the default solver (lbfgs); it was a bit slower in my case when I chose 'sag', but that also gave the best results.**

### Build Submission 
This will transform the test data with the fitted vectorizer and predict the values using our chosen classifier. Once that is complete, I create a submission CSV to evaluate my results.

In [None]:
# Vectorize the test questions with the vocabulary learned from the training
# split, then predict with the fitted SGD classifier.
X_submission = tfidf_vect.transform(test_df['question_text'])
predicted_test = sgd.predict(X_submission)

# Build the submission as a new frame rather than adding a column to test_df,
# so the loaded test data stays unchanged when this cell is re-run.
submission = (
    test_df
    .drop(columns=['question_text'])
    .assign(prediction=predicted_test)
)
submission.to_csv('submission.csv', index=False)

# Last expression in the cell, so the preview is actually displayed.
submission.head()