In [None]:
import pandas as pd
from sklearn import model_selection

# Create Folds

This is an important step for blending and stacking. The data is split with stratified k-fold, and every model below is trained and validated on these same folds.

In [None]:
# Load the labelled reviews and add a placeholder fold column.
df = pd.read_csv("data/labeledTrainData.tsv", sep="\t")
df.loc[:, "kfold"] = -1  # overwritten once each row is assigned to a fold

# Shuffle the rows with a fixed seed so that fold membership (and every score
# derived from it) is reproducible after a kernel restart.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Stratify on the sentiment label.
y = df.sentiment.values
skf = model_selection.StratifiedKFold(n_splits=5)

In [None]:
# Tag every row with the index of the fold in which it is held out.
for fold_id, (train_idx, valid_idx) in enumerate(skf.split(X=df, y=y)):
    df.loc[valid_idx, "kfold"] = fold_id

# Persist the fold assignment; the model cells read this file.
df.to_csv("data/train_folds.csv", index=False)

# Check the Folds

In [None]:
# Reload the saved folds and show how many rows landed in each one.
folds = pd.read_csv("data/train_folds.csv")
folds["kfold"].value_counts()

# Model 1: Logistic Regression Model (TfidfVectorizer)

In [None]:
from sklearn import linear_model, metrics
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def run_training(fold):
    """Train TF-IDF + logistic regression with ``fold`` held out.

    Reads ``data/train_folds.csv``, fits on the rows whose ``kfold`` differs
    from ``fold``, scores the held-out rows and prints the fold AUC.

    Returns the held-out rows restricted to ``id``, ``sentiment``, ``kfold``
    and ``lr_tf_pred`` (column 1 of ``predict_proba``).
    """
    data = pd.read_csv("data/train_folds.csv")
    data["review"] = data["review"].apply(str)

    is_holdout = data["kfold"] == fold
    train_part = data[~is_holdout].reset_index(drop=True)
    valid_part = data[is_holdout].reset_index(drop=True)

    # The vectorizer is fitted on the training rows only.
    vectorizer = TfidfVectorizer()
    x_train = vectorizer.fit_transform(train_part["review"].values)
    x_valid = vectorizer.transform(valid_part["review"].values)

    model = linear_model.LogisticRegression()
    model.fit(x_train, train_part["sentiment"].values)
    scores = model.predict_proba(x_valid)[:, 1]

    fold_auc = metrics.roc_auc_score(valid_part["sentiment"].values, scores)
    print("fold : %2d, auc : %f" % (fold, fold_auc))

    valid_part["lr_tf_pred"] = scores
    return valid_part[["id", "sentiment", "kfold", "lr_tf_pred"]]

In [None]:
# One frame of out-of-fold predictions per validation fold.
dfs = [run_training(fold_id) for fold_id in range(5)]

In [None]:
# Stack the per-fold frames row-wise and save them for the blending step.
fin_valid_df = pd.concat(dfs, axis=0)
print(fin_valid_df.shape)
fin_valid_df.to_csv("data/lr_tf.csv", index=False)

In [None]:
# Preview the first rows of the saved out-of-fold predictions.
fin_valid_df.head(n=5)

The output of each of the remaining models should look like the output above.

# Model 2: Logistic Regression Model (CountVectorizer)

In [None]:
from sklearn import linear_model, metrics
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
def run_training(fold):
    """Train CountVectorizer + logistic regression with ``fold`` held out.

    Reads ``data/train_folds.csv``, fits on the rows whose ``kfold`` differs
    from ``fold``, scores the held-out rows and prints the fold AUC.

    Returns the held-out rows restricted to ``id``, ``sentiment``, ``kfold``
    and ``lr_cnt_pred`` (column 1 of ``predict_proba``).
    """
    frame = pd.read_csv("data/train_folds.csv")
    frame["review"] = frame["review"].apply(str)

    train_rows = frame.loc[frame["kfold"] != fold].reset_index(drop=True)
    valid_rows = frame.loc[frame["kfold"] == fold].reset_index(drop=True)

    train_docs = train_rows["review"].values
    valid_docs = valid_rows["review"].values

    # The vocabulary is learned from the training rows only.
    counter = CountVectorizer().fit(train_docs)
    x_train = counter.transform(train_docs)
    x_valid = counter.transform(valid_docs)

    y_train = train_rows["sentiment"].values
    y_valid = valid_rows["sentiment"].values

    model = linear_model.LogisticRegression()
    model.fit(x_train, y_train)
    proba = model.predict_proba(x_valid)
    scores = proba[:, 1]

    fold_auc = metrics.roc_auc_score(y_valid, scores)
    print("fold : %2d, auc : %f" % (fold, fold_auc))

    valid_rows["lr_cnt_pred"] = scores
    keep = ["id", "sentiment", "kfold", "lr_cnt_pred"]
    return valid_rows[keep]

In [None]:
# Run every fold once and keep its out-of-fold predictions.
dfs = []
for fold_id in range(5):
    dfs.append(run_training(fold_id))

In [None]:
# Stack the per-fold frames row-wise and save them for the blending step.
fin_valid_df = pd.concat(dfs, axis=0)
print(fin_valid_df.shape)
fin_valid_df.to_csv("data/lr_cnt.csv", index=False)

In [None]:
# Preview the first rows of the saved out-of-fold predictions.
fin_valid_df.head(n=5)

# Model 3: RandomForestClassifier

In [None]:
from sklearn import decomposition, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
def run_training(fold, random_state=42):
    """Train TF-IDF -> TruncatedSVD -> random forest with ``fold`` held out.

    Reads ``data/train_folds.csv``, fits the vectorizer, the SVD and the
    forest on the rows whose ``kfold`` differs from ``fold``, scores the
    held-out rows and prints the fold AUC.

    Parameters
    ----------
    fold : int
        Value of ``kfold`` to hold out for validation.
    random_state : int, default 42
        Seed passed to both ``TruncatedSVD`` and ``RandomForestClassifier``
        so that repeated runs give the same predictions.

    Returns
    -------
    pandas.DataFrame
        The held-out rows restricted to ``id``, ``sentiment``, ``kfold`` and
        ``lr_svd_pred`` (column 1 of ``predict_proba``). The column keeps its
        ``lr_`` prefix because the blending cells look for that name.
    """
    df = pd.read_csv("data/train_folds.csv")
    df.review = df.review.apply(str)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    # Text features: fitted on the training rows only.
    tfv = TfidfVectorizer()
    tfv.fit(df_train.review.values)

    xtrain = tfv.transform(df_train.review.values)
    xvalid = tfv.transform(df_valid.review.values)

    # Reduce the sparse TF-IDF matrix to 120 dense components for the forest.
    svd = decomposition.TruncatedSVD(n_components=120, random_state=random_state)
    svd.fit(xtrain)

    xtrain_svd = svd.transform(xtrain)
    xvalid_svd = svd.transform(xvalid)

    ytrain = df_train.sentiment.values
    yvalid = df_valid.sentiment.values

    clf = RandomForestClassifier(
        n_estimators=100, n_jobs=-1, random_state=random_state
    )
    clf.fit(xtrain_svd, ytrain)
    pred = clf.predict_proba(xvalid_svd)[:, 1]

    auc = metrics.roc_auc_score(yvalid, pred)
    print("fold : %2d, auc : %f" % (fold, auc))

    df_valid.loc[:, "lr_svd_pred"] = pred

    return df_valid[["id", "sentiment", "kfold", "lr_svd_pred"]]

In [None]:
# Run every fold once and keep its out-of-fold predictions.
dfs = list(map(run_training, range(5)))

In [None]:
# Stack the per-fold frames row-wise and save them for the blending step.
fin_valid_df = pd.concat(dfs, axis=0)
print(fin_valid_df.shape)
fin_valid_df.to_csv("data/lr_svd.csv", index=False)

# Blending

In [None]:
import glob

In [None]:
# glob returns paths in arbitrary order; sort them so the merged frame has the
# same column order on every run and machine.
files = sorted(glob.glob("data/lr*.csv"))

In [None]:
# Combine the per-model prediction files into one frame keyed by "id".
if not files:
    raise FileNotFoundError("no prediction files matched data/lr*.csv")

df = None
for path in files:
    temp_df = pd.read_csv(path)
    if df is None:
        df = temp_df
        continue
    # Bring over only the columns that are not already present. Merging the
    # full frame on "id" would duplicate the shared columns with _x/_y
    # suffixes and leave no reliable "sentiment" column.
    new_cols = [c for c in temp_df.columns if c == "id" or c not in df.columns]
    df = df.merge(temp_df[new_cols], on="id", how="left")

print(df.head(10))

In [None]:
# Prediction columns written by the three base models.
pred_cols = [
    "lr_tf_pred",
    "lr_svd_pred",
    "lr_cnt_pred",
]

In [None]:
# Overall out-of-fold AUC of each base model on its own.
labels = df["sentiment"].values
for pred_col in pred_cols:
    col_auc = metrics.roc_auc_score(labels, df[pred_col].values)
    print("pred_col=%s, overall_auc=%f" % (pred_col, col_auc))

In [None]:
import numpy as np

# Applying Average Blending. 

i.e., taking the average of all the predictions.

In [None]:
print("Average")
# Row-wise mean of the three models' predictions.
blend_matrix = df[["lr_tf_pred", "lr_svd_pred", "lr_cnt_pred"]].values
avg_pred = np.mean(blend_matrix, axis=1)
blend_auc = metrics.roc_auc_score(df["sentiment"].values, avg_pred)
print(blend_auc)

# Applying Weighted Average Blending. 

i.e., applying a weight to each model's predictions and then taking the weighted average of all the predictions.

In [None]:
# Raw predictions of each base model.
lr_tf_pred = df["lr_tf_pred"].values
lr_svd_pred = df["lr_svd_pred"].values
lr_cnt_pred = df["lr_cnt_pred"].values

# The TF-IDF model gets weight 3, the other two weight 1 (weights sum to 5).
weighted_sum = 3 * lr_tf_pred + lr_cnt_pred + lr_svd_pred
avg_pred = weighted_sum / 5
print("Weighted Average")
print(metrics.roc_auc_score(df["sentiment"].values, avg_pred))

# Applying Rank Average Blending. 

i.e., taking the average of the ranks of all the predictions.

In [None]:
# Replace each model's predictions by their ranks within that model's column.
lr_tf_pred = df["lr_tf_pred"].rank().values
lr_svd_pred = df["lr_svd_pred"].rank().values
lr_cnt_pred = df["lr_cnt_pred"].rank().values

rank_sum = lr_tf_pred + lr_cnt_pred + lr_svd_pred
avg_pred = rank_sum / 3
print("Rank Average")
print(metrics.roc_auc_score(df["sentiment"].values, avg_pred))

# Applying Weighted Rank Average Blending. 

i.e., applying a weight to each model's ranked predictions and then taking the weighted average of the ranks.

In [None]:
# Replace each model's predictions by their ranks within that model's column.
lr_tf_pred = df["lr_tf_pred"].rank().values
lr_svd_pred = df["lr_svd_pred"].rank().values
lr_cnt_pred = df["lr_cnt_pred"].rank().values

# The TF-IDF model gets weight 3, the other two weight 1 (weights sum to 5).
weighted_rank_sum = 3 * lr_tf_pred + lr_cnt_pred + lr_svd_pred
avg_pred = weighted_rank_sum / 5
print("Weighted Rank Average")
print(metrics.roc_auc_score(df["sentiment"].values, avg_pred))

Now you can keep changing the weights and check which combination gives the highest AUC.