In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
elon = pd.read_csv('elon_data.csv',parse_dates=[0], infer_datetime_format=True)
elon = elon.drop(['Unnamed: 0'], axis=1)
elon = elon.dropna(axis=0)
bored_elon = pd.read_csv('bored_elon_data.csv',parse_dates=[0], infer_datetime_format=True)
bored_elon = bored_elon.drop(['Unnamed: 0'], axis=1)
bored_elon = elon.dropna(axis=0)

In [None]:
bored_elon.head()

Unnamed: 0,Tweet,Date,Retweets
0,@RanNatanzon @Tesla @Cortica This is completel...,Tue Mar 20 18:47:20 +0000 2018,195
1,Paid respects to Masada earlier today. Live fr...,Tue Mar 20 02:20:29 +0000 2018,844
2,Learning how to pour flaming absinthe over a t...,Mon Mar 19 18:09:26 +0000 2018,970
3,@IraEhrenpreis @Tesla Thanks for your support ...,Sun Mar 18 04:31:53 +0000 2018,157
4,@TheOnion Your cruel taunts cut me deep. Deep....,Thu Mar 15 18:46:45 +0000 2018,465


In [None]:
#remove punctuation from Tweet text
elon['Tweet'] = elon['Tweet'].str.replace('[^\w\s]','')
bored_elon['Tweet'] = bored_elon['Tweet'].str.replace('[^\w\s]','')

#add in label columns for data
elon['Label'] = "Elon"
bored_elon['Label'] = "BoredElon"

#join elon and bored_elon
frames = [elon, bored_elon]
df = pd.concat(frames)

In [None]:
df.head()

Unnamed: 0,Tweet,Date,Retweets,Label
0,RanNatanzon Tesla Cortica This is completely f...,Tue Mar 20 18:47:20 +0000 2018,195,Elon
1,Paid respects to Masada earlier today Live fre...,Tue Mar 20 02:20:29 +0000 2018,844,Elon
2,Learning how to pour flaming absinthe over a t...,Mon Mar 19 18:09:26 +0000 2018,970,Elon
3,IraEhrenpreis Tesla Thanks for your support ov...,Sun Mar 18 04:31:53 +0000 2018,157,Elon
4,TheOnion Your cruel taunts cut me deep Deep Bu...,Thu Mar 15 18:46:45 +0000 2018,465,Elon


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['Label'], test_size=0.25, random_state=42, stratify=df['Label'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvec = TfidfVectorizer(stop_words='english',ngram_range=(1, 5), min_df = .02, max_df=.95)
tvec.fit(X_train)

#transforming X_train, X_test and hold_out into dataframes
tweet_train = pd.DataFrame(tvec.transform(X_train).todense(), columns = tvec.get_feature_names())
tweet_test = pd.DataFrame(tvec.transform(X_test).todense(), columns = tvec.get_feature_names())
#tweet_holdout = pd.DataFrame(cvec.transform(hold_total["Tweet"]).todense(),columns=cvec.get_feature_names())

In [None]:
tweet_train.columns

Index(['amp', 'car', 'dragon', 'elonmusk', 'falcon', 'good', 'great', 'just',
       'landing', 'launch', 'like', 'model', 'new', 'rocket', 'rt',
       'rt spacex', 'rt teslamotors', 'spacex', 'tesla', 'teslamotors', 'yes'],
      dtype='object')

In [None]:
tweet_test.head()

Unnamed: 0,amp,car,dragon,elonmusk,falcon,good,great,just,landing,launch,...,model,new,rocket,rt,rt spacex,rt teslamotors,spacex,tesla,teslamotors,yes
0,0.694188,0.0,0.0,0.0,0.0,0.0,0.0,0.719794,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# Setup logistic regression and score train set
logreg = LogisticRegression()
logreg.fit(tweet_train, y_train)
print("Train score: ", logreg.score(tweet_train, y_train))

#using logreg on test set
y_probs = logreg.predict(tweet_test) #predict y values for X_test
print("Output for Tested Model:")
print("Confusion Matrix of Predictions: ")
print(confusion_matrix(y_test, y_probs))

#printing classification report
print("Classification Matrix: ")
print(classification_report(y_test, y_probs, target_names=["BoredElon", "Elon"]))
print("Test score: ", logreg.score(tweet_test,y_test))

Train score:  0.513763395671
Output for Tested Model:
Confusion Matrix of Predictions: 
[[256 538]
 [321 472]]
Classification Matrix: 
             precision    recall  f1-score   support

  BoredElon       0.44      0.32      0.37       794
       Elon       0.47      0.60      0.52       793

avg / total       0.46      0.46      0.45      1587

Test score:  0.45872715816


In [None]:
from sklearn import svm
# Setup svm and score train set
clf = svm.SVC()
clf.fit(tweet_train, y_train)  
print("Train score: ", clf.score(tweet_train, y_train))

#using logreg on test set
y_probs = clf.predict(tweet_test) #predict y values for X_test
print("Output for Tested Model:")
print("Confusion Matrix of Predictions: ")
print(confusion_matrix(y_test, y_probs))

#printing classification report
print("Classification Matrix: ")
print(classification_report(y_test, y_probs, target_names=["BoredElon", "Elon"]))
print("Test score: ", clf.score(tweet_test,y_test))

Train score:  0.513133011137
Output for Tested Model:
Confusion Matrix of Predictions: 
[[182 612]
 [244 549]]
Classification Matrix: 
             precision    recall  f1-score   support

  BoredElon       0.43      0.23      0.30       794
       Elon       0.47      0.69      0.56       793

avg / total       0.45      0.46      0.43      1587

Test score:  0.460617517328


In [None]:

import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import RandomizedSearchCV

# #############################################################################
# Train classifiers
#
# For an initial search, a logarithmic grid with basis
# 10 is often helpful. Using a basis of 2, a finer
# tuning can be achieved but at a much higher cost.

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = RandomizedSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(tweet_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

# Now we need to fit a classifier for all parameters in the 2d version
# (we use a smaller set of parameters here because it takes a while to train)

C_2d_range = [1e-2, 1, 1e2]
gamma_2d_range = [1e-1, 1, 1e1]
classifiers = []
for C in C_2d_range:
    for gamma in gamma_2d_range:
        clf = SVC(C=C, gamma=gamma)
        clf.fit(tweet_train, y_train)
        classifiers.append((C, gamma, clf))

# #############################################################################
# Visualization
#
# draw visualization of parameter effects

plt.figure(figsize=(8, 6))
xx, yy = np.meshgrid(np.linspace(-3, 3, 200), np.linspace(-3, 3, 200))
for (k, (C, gamma, clf)) in enumerate(classifiers):
    # evaluate decision function in a grid
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # visualize decision function for these parameters
    plt.subplot(len(C_2d_range), len(gamma_2d_range), k + 1)
    plt.title("gamma=10^%d, C=10^%d" % (np.log10(gamma), np.log10(C)),
              size='medium')

    # visualize parameter's effect on decision function
    plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu)
    plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_2d, cmap=plt.cm.RdBu_r,
                edgecolors='k')
    plt.xticks(())
    plt.yticks(())
    plt.axis('tight')

scores = grid.cv_results_['mean_test_score'].reshape(len(C_range),
                                                     len(gamma_range))

# Draw heatmap of the validation accuracy as a function of gamma and C
#
# The score are encoded as colors with the hot colormap which varies from dark
# red to bright yellow. As the most interesting scores are all located in the
# 0.92 to 0.97 range we use a custom normalizer to set the mid-point to 0.92 so
# as to make it easier to visualize the small variations of score values in the
# interesting range while not brutally collapsing all the low score values to
# the same color.

plt.figure(figsize=(8, 6))
plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
           norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
plt.xticks(np.arange(len(gamma_range)), gamma_range, rotation=45)
plt.yticks(np.arange(len(C_range)), C_range)
plt.title('Validation accuracy')
plt.show()

TypeError: __init__() got an unexpected keyword argument 'param_grid'