In [76]:
import csv
import pandas as pd
import glob
import random
import numpy as np

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.metrics import classification_report
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, log_loss, roc_curve, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.calibration import CalibratedClassifierCV

from scipy.sparse import csr_matrix


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jenniferwilson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Query used to get data from Hive to get raw data from Hadoop.

We ran this in Hadoop on WordPress.com's servers: https://mc.a8c.com/pb/213b7/#plain

In [62]:
# Each month has a different CSV of data. Here we programatically combine them into one dataframe.

files = glob.glob('Data/*.csv')

li = []

for filename in files:
    num_lines = sum(1 for l in open(filename))
    size = int(num_lines / 6 ) # use these values: 3,4,5,6
    skip_idx = random.sample(range(1, num_lines), num_lines - size)
    df = pd.read_csv(filename, skiprows=skip_idx, index_col=None, header=0)
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)

# Shuffle the data.
df = shuffle(df)

# Confirm size of dataset.
df.shape

(13312, 12)

In [63]:
# Check class balance. Looks pretty balanced!

df['plan_purchased_nice'].value_counts()

Business    5022
Premium     4718
Personal    3572
Name: plan_purchased_nice, dtype: int64

In [64]:
# Remove stopwords so as to clean up our features (vectorized text).
stop = set(stopwords.words('english'))
df['msg_whole_clean'] = df['msg_whole'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [65]:
# Create features out of text in chat transcripts.

vectorizer = CountVectorizer(ngram_range=(2, 6), analyzer ='word', max_df =.75, min_df = .05) 

features = vectorizer.fit_transform(df['msg_whole_clean'])


In [66]:
# Split into train and test segments.

X_train, X_test, y_train, y_test = train_test_split(
         features, df['plan_purchased_nice'], test_size=0.25, random_state=42)


In [67]:
# Run data through various classifiers to find the highest accuracy.

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="rbf", C=0.025, probability=True),
    CalibratedClassifierCV(LinearSVC()),
    NuSVC(probability=True),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis()]

# Logging for visual comparison (optional)
log_cols=["Classifier", "Accuracy", "Log Loss"]
log = pd.DataFrame(columns=log_cols)

for clf in classifiers:
    
    # For the last three classifiers above to run, we need to convert 
    # the sparse matrix generated from the countvectorizer step above
    # into a dense matrix.
    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__
    
    print("="*30)
    print(name)
    
    print('****Results****')
    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Accuracy: {:.4%}".format(acc))
    
    train_predictions = clf.predict_proba(X_test)
    ll = log_loss(y_test, train_predictions)
    print("Log Loss: {}".format(ll))
    
    log_entry = pd.DataFrame([[name, acc*100, ll]], columns=log_cols)
    log = log.append(log_entry)
    
print("="*30)

GaussianNB
****Results****
Accuracy: 40.1142%
Log Loss: 5.28079487964
LinearDiscriminantAnalysis
****Results****
Accuracy: 45.6130%
Log Loss: 1.04323102642
QuadraticDiscriminantAnalysis
****Results****
Accuracy: 39.2127%
Log Loss: 18.678103331


In [80]:
#Start individually optimizing hyperparameters of highest performing algorithm: GradientBoostingClassifier.

# Experiment with different learning rates.
learning_rates = [1, 0.5, 0.25, 0.1, 0.05, 0.01]
print('****Results****')

for eta in learning_rates:
    clf = GradientBoostingClassifier(learning_rate=eta)

    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Learning Rate: {:.4%}".format(eta))
    print("Accuracy: {:.4%}".format(acc))
    print("="*30)


****Results****
Learning Rate: 100.0000%
Accuracy: 43.6298%
****Results****
Learning Rate: 50.0000%
Accuracy: 44.5312%
****Results****
Learning Rate: 25.0000%
Accuracy: 45.9435%
****Results****
Learning Rate: 10.0000%
Accuracy: 46.7548%
****Results****
Learning Rate: 5.0000%
Accuracy: 47.4459%
****Results****
Learning Rate: 1.0000%
Accuracy: 46.6647%


In [84]:
# Experiment with different n_estimators.

n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]
print('****Results****')

for estimator in n_estimators:
    clf = GradientBoostingClassifier(n_estimators=estimator)

    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("N estimators: {}".format(estimator))
    print("Accuracy: {:.4%}".format(acc))
    print("="*30)

#  Highest: 32

****Results****
N estimators: 1
Accuracy: 44.4712%
N estimators: 2
Accuracy: 44.8918%
N estimators: 4
Accuracy: 44.9219%
N estimators: 8
Accuracy: 46.6647%
N estimators: 16
Accuracy: 46.9351%
N estimators: 32
Accuracy: 47.4159%
N estimators: 64
Accuracy: 47.1454%
N estimators: 100
Accuracy: 47.2055%
N estimators: 200
Accuracy: 45.9435%


In [85]:
# Experiment with different max_depths.

max_depths = np.linspace(1, 32, 32, endpoint=True)

print('****Results****')

for max_depth in max_depths:
    clf = GradientBoostingClassifier(max_depth=max_depth)

    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Max Depth: {}".format(max_depth))
    print("Accuracy: {:.4%}".format(acc))
    print("="*30)


****Results****
Max Depth: 1.0
Accuracy: 47.3858%
Max Depth: 2.0
Accuracy: 47.2957%
Max Depth: 3.0
Accuracy: 46.7548%
Max Depth: 4.0
Accuracy: 46.3642%
Max Depth: 5.0
Accuracy: 45.9736%
Max Depth: 6.0
Accuracy: 46.1238%
Max Depth: 7.0
Accuracy: 45.2224%
Max Depth: 8.0
Accuracy: 44.7716%
Max Depth: 9.0
Accuracy: 44.3810%
Max Depth: 10.0
Accuracy: 44.7416%
Max Depth: 11.0
Accuracy: 44.0805%
Max Depth: 12.0
Accuracy: 43.7500%
Max Depth: 13.0
Accuracy: 43.3293%
Max Depth: 14.0
Accuracy: 43.8702%
Max Depth: 15.0
Accuracy: 43.2692%


KeyboardInterrupt: 

In [86]:
# Experiment with different min_samples_splits.

min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)

print('****Results****')

for min_samples_split in min_samples_splits:
    clf = GradientBoostingClassifier(min_samples_split=min_samples_split)
    
    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Max Samples Split: {}".format(min_samples_split))
    print("Accuracy: {:.4%}".format(acc))
    print("="*30)


****Results****
Max Samples Split: 0.1
Accuracy: 47.7464%
Max Samples Split: 0.2
Accuracy: 48.0769%
Max Samples Split: 0.3
Accuracy: 47.6562%
Max Samples Split: 0.4
Accuracy: 47.7764%
Max Samples Split: 0.5
Accuracy: 47.7163%
Max Samples Split: 0.6
Accuracy: 47.8365%
Max Samples Split: 0.7
Accuracy: 48.0469%
Max Samples Split: 0.8
Accuracy: 47.5361%
Max Samples Split: 0.9
Accuracy: 47.3858%
Max Samples Split: 1.0
Accuracy: 47.3858%


In [89]:
# Experiment with different min_samples_leafs.

min_samples_leafs = np.linspace(0.1, 0.5, 5, endpoint=True)

for min_samples_leaf in min_samples_leafs:
    clf = GradientBoostingClassifier(min_samples_leaf=min_samples_leaf)

    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Max Samples Leafs: {}".format(min_samples_leaf))
    print("Accuracy: {:.4%}".format(acc))
    print("="*30)

Max Samples Leafs: 0.1
Accuracy: 41.5865%
Max Samples Leafs: 0.2
Accuracy: 40.0841%
Max Samples Leafs: 0.3
Accuracy: 39.5733%
Max Samples Leafs: 0.4
Accuracy: 39.5433%
Max Samples Leafs: 0.5
Accuracy: 38.2212%


In [91]:
# Experiment with different max_features.

max_features = list(range(1,features.shape[1]))

for max_feature in max_features:
    clf = GradientBoostingClassifier(max_features=max_feature)
    
    X_train = csr_matrix(X_train).todense()
    X_test = csr_matrix(X_test).todense()
    clf.fit(X_train, y_train)
    name = clf.__class__.__name__

    train_predictions = clf.predict(X_test)
    acc = accuracy_score(y_test, train_predictions)
    print("Max Features: {}".format(max_feature))
    print("Accuracy: {:.4%}".format(acc))
    print("="*30)

Max Features: 1
Accuracy: 45.7031%
Max Features: 2
Accuracy: 45.8233%
Max Features: 3
Accuracy: 46.7548%
Max Features: 4
Accuracy: 47.4760%
Max Features: 5
Accuracy: 46.7849%
Max Features: 6
Accuracy: 46.8450%
Max Features: 7
Accuracy: 47.2356%
Max Features: 8
Accuracy: 47.1454%
Max Features: 9
Accuracy: 46.9952%
Max Features: 10
Accuracy: 47.0252%
Max Features: 11
Accuracy: 47.2356%
Max Features: 12
Accuracy: 48.0469%
Max Features: 13
Accuracy: 47.2656%
Max Features: 14
Accuracy: 47.3257%
Max Features: 15
Accuracy: 47.7163%
Max Features: 16
Accuracy: 46.9651%
Max Features: 17
Accuracy: 47.2356%
Max Features: 18
Accuracy: 47.5060%
Max Features: 19
Accuracy: 47.3858%
Max Features: 20
Accuracy: 47.0553%
Max Features: 21
Accuracy: 47.5060%
Max Features: 22
Accuracy: 47.7163%
Max Features: 23
Accuracy: 47.1454%
Max Features: 24
Accuracy: 47.4760%
Max Features: 25
Accuracy: 47.2055%
Max Features: 26
Accuracy: 46.7849%
Max Features: 27
Accuracy: 47.4760%
Max Features: 28
Accuracy: 47.2356%
M

KeyboardInterrupt: 

In [94]:
# By combining the individual optimizations above into one algorithm
# we can see the impact this has on accuracy: 42.3077%.
# That's worse than the classifier with NO hyperparameters (see next cell).

clf = GradientBoostingClassifier(learning_rate = 0.5,
                                    n_estimators = 32,
                                    max_features = 12,
                                    min_samples_split = 0.7,
                                    min_samples_leaf = 0.1)
    
X_train = csr_matrix(X_train).todense()
X_test = csr_matrix(X_test).todense()
clf.fit(X_train, y_train)
name = clf.__class__.__name__

train_predictions = clf.predict(X_test)
acc = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))

Accuracy: 42.3077%


In [95]:
# This is the classifier run with no hyperparamters.
# Accuracy = 46.7248%

clf = GradientBoostingClassifier()
    
X_train = csr_matrix(X_train).todense()
X_test = csr_matrix(X_test).todense()
clf.fit(X_train, y_train)
name = clf.__class__.__name__

train_predictions = clf.predict(X_test)
acc = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))

Accuracy: 46.7248%


In [107]:
# Finally, we tested optimizing each of the hyperparamters below
# one by one, adding a new parameter each time to the highest performing
# from the previous run.
# Accuracy = 47.5361%

clf = GradientBoostingClassifier(learning_rate =.5,
                                 n_estimators = 8,
                                 max_depth = 2,
                                 min_samples_split = 0.2,
                                 max_features = 35)
#                                      min_samples_leaf 

X_train = csr_matrix(X_train).todense()
X_test = csr_matrix(X_test).todense()
clf.fit(X_train, y_train)
name = clf.__class__.__name__

train_predictions = clf.predict(X_test)
acc = accuracy_score(y_test, train_predictions)
print("Accuracy: {:.4%}".format(acc))



Accuracy: 47.5361%
