### Setup

In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

### Read in data

In [2]:
def load_tweets(csv_path):
    """Read one scraped-tweet CSV and return it without the index column or incomplete rows.

    Parameters
    ----------
    csv_path : str
        Path to a CSV that contains an 'Unnamed: 0' column (the saved row
        index) next to the tweet columns.

    Returns
    -------
    pandas.DataFrame
        The CSV contents minus the 'Unnamed: 0' column, with every row that
        has at least one missing value removed.
    """
    # The previous call passed parse_dates=[0], but column 0 is the
    # 'Unnamed: 0' index column that is dropped right below, so that option
    # never touched the 'Date' column. It also passed infer_datetime_format,
    # which is deprecated since pandas 2.0. Both are removed; the resulting
    # frame is unchanged and 'Date' stays a raw string.
    # NOTE(review): if datetimes are needed later, convert 'Date' explicitly
    # with pd.to_datetime and a format string matching the data.
    return (
        pd.read_csv(csv_path)
        .drop(columns=['Unnamed: 0'])
        .dropna(axis=0)
    )


elon = load_tweets('elon_data.csv')
bored_elon = load_tweets('bored_elon_data.csv')

In [3]:
# Preview the first five rows of the loaded Elon frame.
elon.head(n=5)

Unnamed: 0,Tweet,Date,Retweets
0,@RanNatanzon @Tesla @Cortica This is completel...,Tue Mar 20 18:47:20 +0000 2018,195
1,Paid respects to Masada earlier today. Live fr...,Tue Mar 20 02:20:29 +0000 2018,844
2,Learning how to pour flaming absinthe over a t...,Mon Mar 19 18:09:26 +0000 2018,970
3,@IraEhrenpreis @Tesla Thanks for your support ...,Sun Mar 18 04:31:53 +0000 2018,157
4,@TheOnion Your cruel taunts cut me deep. Deep....,Thu Mar 15 18:46:45 +0000 2018,465


### Clean it up

In [4]:
# Remove punctuation from the tweet text: drop every character that is neither
# a word character nor whitespace.
# regex=True must be explicit: since pandas 2.0 Series.str.replace treats the
# pattern as a literal string by default, which would leave the text untouched.
# The raw string keeps \w and \s from being read as (invalid) string escapes.
PUNCTUATION_PATTERN = r'[^\w\s]'
elon['Tweet'] = elon['Tweet'].str.replace(PUNCTUATION_PATTERN, '', regex=True)
bored_elon['Tweet'] = bored_elon['Tweet'].str.replace(PUNCTUATION_PATTERN, '', regex=True)

# Add the label column that records which data set each tweet came from.
elon['Label'] = "Elon"
bored_elon['Label'] = "BoredElon"

# Join elon and bored_elon into one frame. ignore_index=True renumbers the
# rows so the combined frame does not carry duplicate index labels from the
# two source frames.
frames = [elon, bored_elon]
df = pd.concat(frames, ignore_index=True)

In [5]:
# Preview the first five rows of the combined, labelled frame.
df.head(n=5)

Unnamed: 0,Tweet,Date,Retweets,Label
0,RanNatanzon Tesla Cortica This is completely f...,Tue Mar 20 18:47:20 +0000 2018,195,Elon
1,Paid respects to Masada earlier today Live fre...,Tue Mar 20 02:20:29 +0000 2018,844,Elon
2,Learning how to pour flaming absinthe over a t...,Mon Mar 19 18:09:26 +0000 2018,970,Elon
3,IraEhrenpreis Tesla Thanks for your support ov...,Sun Mar 18 04:31:53 +0000 2018,157,Elon
4,TheOnion Your cruel taunts cut me deep Deep Bu...,Thu Mar 15 18:46:45 +0000 2018,465,Elon


### Split the data into training and testing sets

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the tweets for testing. The split is stratified on the
# label column and uses a fixed random_state so it is repeatable.
tweet_text = df['Tweet']
author_label = df['Label']
X_train, X_test, y_train, y_test = train_test_split(
    tweet_text,
    author_label,
    test_size=0.25,
    random_state=42,
    stratify=author_label,
)

### Create text features from the tweets

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over 1- to 4-grams with English stop words removed. The float
# min_df / max_df values are document-frequency proportions: a term is kept
# only if it occurs in at least 2% and at most 95% of the training tweets.
tvec = TfidfVectorizer(stop_words='english', ngram_range=(1, 4), min_df=.02, max_df=.95)
# The vocabulary is learned from the training tweets only.
tvec.fit(X_train)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on versions that
# predate the new one.
if hasattr(tvec, 'get_feature_names_out'):
    feature_names = tvec.get_feature_names_out()
else:
    feature_names = tvec.get_feature_names()

# Transform X_train and X_test into dense DataFrames with one column per
# feature. toarray() yields a plain ndarray rather than the np.matrix that
# todense() returns.
tweet_train = pd.DataFrame(tvec.transform(X_train).toarray(), columns=feature_names)
tweet_test = pd.DataFrame(tvec.transform(X_test).toarray(), columns=feature_names)


In [8]:
# Show the feature names (column labels) of the training feature frame.
tweet_train.columns

Index(['amp', 'car', 'dont', 'falcon', 'good', 'just', 'launch', 'like',
       'model', 'new', 'people', 'rocket', 'rt', 'rt spacex', 'spacex',
       'tesla', 'teslamotors', 'time', 'yes'],
      dtype='object')

In [12]:
# Preview the first five rows of the test-set feature frame.
tweet_test.head(n=5)

Unnamed: 0,amp,car,dont,falcon,good,just,launch,like,model,new,people,rocket,rt,rt spacex,spacex,tesla,teslamotors,time,yes
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Fit a logistic regression to the data and report the results

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression

# Fit a default-configured logistic regression on the training features and
# report its accuracy on the data it was trained on.
logreg = LogisticRegression()
logreg.fit(tweet_train, y_train)
train_accuracy = logreg.score(tweet_train, y_train)
print("Train score: ", train_accuracy)

# Predict on the test features.
# NOTE: despite its name, y_probs holds the predicted class labels returned
# by predict(), not probabilities.
y_probs = logreg.predict(tweet_test)
test_confusion = confusion_matrix(y_test, y_probs)
print("Output for Tested Model:")
print("Confusion Matrix of Predictions: ")
print(test_confusion)

# Per-class precision / recall / F1. target_names is given in sorted label
# order ("BoredElon" before "Elon").
test_report = classification_report(
    y_test,
    y_probs,
    target_names=["BoredElon", "Elon"],
)
print("Classification Matrix: ")
print(test_report)

# Accuracy on the test set.
test_accuracy = logreg.score(tweet_test, y_test)
print("Test score: ", test_accuracy)

Train score:  0.711085582998
Output for Tested Model:
Confusion Matrix of Predictions: 
[[ 51 317]
 [ 22 771]]
Classification Matrix: 
             precision    recall  f1-score   support

  BoredElon       0.70      0.14      0.23       368
       Elon       0.71      0.97      0.82       793

avg / total       0.71      0.71      0.63      1161

Test score:  0.708010335917
