In [188]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob 

In [189]:
df = pd.read_csv('uncleanedds.csv')

In [190]:
df = df.drop(columns = ["Unnamed: 0"])
df = df[~df['full_text'].str.startswith('RT')]
df = df[~df['full_text'].str.startswith('LIVE')]
df = df[~df['full_text'].str.startswith('ICYMI')]
df = df[~df['full_text'].str.startswith('WATCH')]
df['no_punc_text'] = df['full_text']

In [191]:
def removeatusernames(tweet):
    return re.sub(r'(@[A-Za-z0-9]+)','',tweet)

In [192]:
def removelinks(tweet):
    return re.sub(r'http\S+','',tweet)

In [193]:
def removehashtags(tweet):
    return re.sub(r'(#[A-Za-z0-9]+)','',tweet)

In [194]:
df['full_text'] = df['full_text'].apply(removelinks).apply(removeatusernames).apply(removehashtags)

In [195]:
df = df[['full_text']]
df.head()

Unnamed: 0,full_text
0,real protective wit my soul where u been
1,what’s your fav lyric 🌪🌬
2,we do but it’s both u feel me
3,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ
4,u deserve the world ! only up from here chicoo...


# Using Vader Package

In [196]:
from textblob import TextBlob
sentiment_objects = [TextBlob(tweet) for tweet in df['full_text']]
sentiment_values = [[tweet.sentiment.polarity, str(tweet)] for tweet in sentiment_objects]
sentiment_df = pd.DataFrame(sentiment_values, columns=["polarity", "text"])


In [197]:
maxval = sentiment_df['polarity'].max()
minval = sentiment_df['polarity'].min()
def normalize(x):

    return (x -minval)/(maxval-minval)

In [198]:
sentiment_df['normalized_polarity'] = sentiment_df['polarity'].apply(normalize)

In [199]:
sentiment_df

Unnamed: 0,polarity,text,normalized_polarity
0,0.200000,real protective wit my soul where u been,0.600000
1,0.250000,what’s your fav lyric 🌪🌬,0.625000
2,0.000000,we do but it’s both u feel me,0.500000
3,0.781250,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ,0.890625
4,0.000000,u deserve the world ! only up from here chicoo...,0.500000
5,0.000000,🖤🖤🖤,0.500000
6,0.000000,🥺 i like it now,0.500000
7,0.500000,i haven’t before and still don’t feel the nee...,0.750000
8,0.350000,a full dad wrote this,0.675000
9,0.000000,SHE DA CHICOOOOO,0.500000


In [200]:
def moods(x):
    if 0 < x <= 0.1:
        return 0
    elif 0.1 < x <= 0.2:
        return 1
    elif 0.2 < x <= 0.3:
        return 2
    elif 0.3 < x <= 0.4:
        return 3
    elif 0.4 < x <= 0.5:
        return 4
    elif 0.5 < x <= 0.6:
        return 5
    elif 0.6 < x <= 0.7:
        return 6
    elif 0.7 < x <= 0.8:
        return 7
    elif 0.8 < x <= 0.9:
        return 8
    else: 
        return 9

In [201]:
sentiment_df['sentiment'] = sentiment_df['normalized_polarity'].apply(moods)

In [202]:
X = sentiment_df['text']
y = sentiment_df['sentiment']

# LOGISTIC REGRESSION

In [203]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = 'word',lowercase = False)
features = vectorizer.fit_transform(sentiment_df['text'])
features_nd = features.toarray()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(
        features_nd, 
        y,
        train_size=0.80)
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)



In [204]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.6924333151878062


# STEMMING

In [205]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
porter = PorterStemmer()
from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [206]:


sentiment_df['Stemming'] = sentiment_df['text'].apply(stemSentence)

sentiment_objects = [TextBlob(tweet) for tweet in sentiment_df['Stemming']]
sentiment_values = [[tweet.sentiment.polarity, str(tweet)] for tweet in sentiment_objects]













stemmmm_sent = pd.DataFrame(sentiment_values, columns=["polarity", "text"])
maxval = stemmmm_sent['polarity'].max()
minval = stemmmm_sent['polarity'].min()
stemmmm_sent['normalized_polarities'] = stemmmm_sent['polarity'].apply(normalize)
stemmmm_sent['moods'] = stemmmm_sent['normalized_polarities'].apply(moods)

In [207]:
y = stemmmm_sent['moods']

In [208]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(analyzer = 'word',lowercase = False)
features = vectorizer.fit_transform(stemmmm_sent['text'])
features_dd = features.toarray()


vectorizer = CountVectorizer(analyzer = 'word',lowercase = False)
fnd = vectorizer.fit_transform(sentiment_df['text'])
features_nd = fnd.toarray()



In [209]:

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test  = train_test_split(
        features_dd, 
        y,
        train_size=0.80)
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression()
log_model = log_model.fit(X=X_train, y=y_train)
y_pred = log_model.predict(X_test)



In [210]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.7664670658682635


# LINEAR REGRESSION

In [211]:
stemmmm_sent

Unnamed: 0,polarity,text,normalized_polarities,moods
0,0.200000,real protect wit my soul where u been,0.600000,5
1,0.250000,what ’ s your fav lyric 🌪🌬,0.625000,6
2,0.000000,we do but it ’ s both u feel me,0.500000,4
3,0.781250,we love u 🖤 thank u 🖤 ! ! ヽ ( ⌒o⌒ ) 人 ( ⌒○⌒ ) ﾉ,0.890625,8
4,0.000000,u deserv the world ! onli up from here chicooo...,0.500000,4
5,0.000000,🖤🖤🖤,0.500000,4
6,0.000000,🥺 i like it now,0.500000,4
7,0.500000,i haven ’ t befor and still don ’ t feel the n...,0.750000,7
8,0.350000,a full dad wrote thi,0.675000,6
9,0.000000,she DA chicooooo,0.500000,4


In [212]:
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

X = stemmmm_sent['text']
y = stemmmm_sent['normalized_polarities']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
word_vec = TfidfVectorizer()
word_vec.fit(X_train)
X_train_feat = word_vec.transform(X_train)
X_test_feat = word_vec.transform(X_test)
clf = Ridge(alpha=.1)
clf.fit(X_train_feat, y_train)
clf.score(X_train_feat, y_train)

0.8815174190357934

In [213]:
for i in np.linspace(0, 1, 11):
    clf = Ridge(alpha=i)
    clf.fit(X_train_feat, y_train)
    score = clf.score(X_test_feat, y_test)
    print("Accuracy " + str(i) + " " + "is " + str(score))
clf = Ridge(alpha=.7)
clf.fit(X_train_feat, y_train)
pred = clf.predict(X_test_feat)
result = X_test.to_frame()
result['True'] = y_test
result['Predicted'] = pred
result.head()

Accuracy 0.0 is 0.2612643216776631
Accuracy 0.1 is 0.6082050767449582
Accuracy 0.2 is 0.6410525593357477
Accuracy 0.30000000000000004 is 0.6550872301621142
Accuracy 0.4 is 0.6622700135217597
Accuracy 0.5 is 0.6661667661908497
Accuracy 0.6000000000000001 is 0.6681254734247877
Accuracy 0.7000000000000001 is 0.6689513201698183
Accuracy 0.8 is 0.6689454420989102
Accuracy 0.9 is 0.6683552083632038
Accuracy 1.0 is 0.6673466090651683


Unnamed: 0,text,True,Predicted
4004,woooooow the best moment in life are the one y...,1.0,0.72042
6412,when r u gon na come here ?,0.5,0.571709
8177,`` the afford care act is here to stay . '' —p...,0.5,0.507195
7612,thi is a huge step forward .,0.7,0.598468
8911,the presid obama is take will grow the economi...,0.5,0.546854


In [214]:
def difference(true, pred):
    return round(np.abs(true-pred), 3)
result['Difference'] = difference(result['True'], result['Predicted'])
def label(diff):
    if diff > .1:
        return 0
    else:
        return 1
result['Label Different'] = result['Difference'].apply(label)


In [217]:
sum(result['Label Different'])/len(result)

0.8709853021230267

In [216]:
result

Unnamed: 0,text,True,Predicted,Difference,Label Different
4004,woooooow the best moment in life are the one y...,1.000000,0.720420,0.280,0
6412,when r u gon na come here ?,0.500000,0.571709,0.072,1
8177,`` the afford care act is here to stay . '' —p...,0.500000,0.507195,0.007,1
7612,thi is a huge step forward .,0.700000,0.598468,0.102,0
8911,the presid obama is take will grow the economi...,0.500000,0.546854,0.047,1
4240,run through that gratitud list everi morn coun...,0.550000,0.521957,0.028,1
5820,_nnoko fai moi sign si tu sera a orlando,0.500000,0.541078,0.041,1
6916,"the economi ad 255,000 job in july—a record-br...",0.500000,0.522394,0.022,1
8113,"`` No matter where you live , thi is a special...",0.582251,0.656479,0.074,1
3541,we shot the night like thi video when i wa 6 m...,0.507378,0.552760,0.045,1


# Logistic Regression

In [103]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train_feat, y_train)
    score = accuracy_score(y_test, lr.predict(X_test_feat))
    print("Accuracy " + str(c) + " " + "is " + str(score))



Accuracy 0.01 is 0.5269461077844312
Accuracy 0.05 is 0.5443658138268916
Accuracy 0.25 is 0.5890038105606968
Accuracy 0.5 is 0.6243875884594448
Accuracy 1 is 0.6668481219379423


# Ridge Regression

In [104]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=.1)
clf.fit(X_train_feat, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [105]:
clf.score(X_train_feat, y_train)

0.9064244499663989

In [106]:
for i in np.linspace(0, 1, 11):
    clf = Ridge(alpha=i)
    clf.fit(X_train_feat, y_train)
    score = clf.score(X_test_feat, y_test)
    print("Accuracy " + str(i) + " " + "is " + str(score))

Accuracy 0.0 is 0.1182721681859662
Accuracy 0.1 is 0.5690893819910046
Accuracy 0.2 is 0.6026957615247559
Accuracy 0.30000000000000004 is 0.6165566865987877
Accuracy 0.4 is 0.6236567323355187
Accuracy 0.5 is 0.6274060043661668
Accuracy 0.6000000000000001 is 0.6293471359649794
Accuracy 0.7000000000000001 is 0.6301205216779031
Accuracy 0.8 is 0.6301134324325792
Accuracy 0.9 is 0.6296135184007492
Accuracy 1.0 is 0.6287403515080998


In [107]:
clf = Ridge(alpha=.7)
clf.fit(X_train_feat, y_train)
pred = clf.predict(X_test_feat)
result = X_test.to_frame()
result['True'] = y_test
result['Predicted'] = pred
result.head()

Unnamed: 0,text,True,Predicted
4004,woooooow the best moments in life are the ones...,4,4.558148
6412,When r u gonna come here?,4,4.798722
8177,"""The Affordable Care Act is here to stay."" —Pr...",4,3.941975
7612,This is a huge step forward.,7,5.221706
8911,The President Obama is taking will grow the e...,4,4.775176


In [108]:
def difference(true, pred):
    return round(np.abs(true-pred), 3)

In [111]:
result['Difference'] = difference(result['True'], result['Predicted'])


In [115]:
def label(diff):
    if diff > .1:
        return 1
    else:
        return 0

In [119]:
result['Label Different'] = result['Difference'].apply(label)


In [120]:
sum(result['Label Different'])/len(result)

0.8818726183995645

In [121]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=500)
regr.fit(X_train_feat, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [122]:
regr.score(X_train_feat, y_train)

0.3441139857590394

# Error

In [110]:
def automatic(X_train_feat, y_train, X_test_feat, y_test, best):
    
    clf = Ridge(alpha=best)
    clf.fit(X_train_feat, y_train)
    pred = clf.predict(X_test_feat)
    result = X_test.to_frame()
    result['True'] = y_test
    result['Predicted'] = pred
    
    def difference(true, pred):
        return round(np.abs(true-pred), 3)
    
    def label(diff):
        if diff > .1:
            return 0
        else:
            return 1
    
    result['Difference'] = difference(result['True'], result['Predicted'])
    result['Label Different'] = result['Difference'].apply(label)
    return sum(result['Label Different'])/len(result)

In [115]:
automatic(X_train_feat, y_train, X_test_feat, y_test, .4)

0.8248102936324645

In [60]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
total_reviews = np.append(X_train.values, X_test.values)
tokenizer_obj.fit_on_texts(total_reviews)

max_length = max([len(s.split()) for s in total_reviews])

vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32, dropout=.2, recurrent_dropout=.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Using TensorFlow backend.


In [None]:
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 6152 samples, validate on 3031 samples
Epoch 1/25
