In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob 

In [2]:
df = pd.read_csv('uncleanedds.csv')

In [3]:
df = df.drop(columns = ["Unnamed: 0"])
df = df[~df['full_text'].str.startswith('RT')]
df = df[~df['full_text'].str.startswith('LIVE')]
df = df[~df['full_text'].str.startswith('ICYMI')]
df = df[~df['full_text'].str.startswith('WATCH')]
df['no_punc_text'] = df['full_text']

In [4]:
def removeatusernames(tweet):
    return re.sub(r'(@[A-Za-z0-9]+)','',tweet)

In [5]:
def removelinks(tweet):
    return re.sub(r'http\S+','',tweet)

In [6]:
def removehashtags(tweet):
    return re.sub(r'(#[A-Za-z0-9]+)','',tweet)

In [7]:
df['full_text'] = df['full_text'].apply(removelinks).apply(removeatusernames).apply(removehashtags)

In [8]:
df = df[['full_text']]
df.head()

Unnamed: 0,full_text
0,real protective wit my soul where u been
1,what’s your fav lyric 🌪🌬
2,we do but it’s both u feel me
3,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ
4,u deserve the world ! only up from here chicoo...


# Using Vader Package

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores(sentence):
    score = analyser.polarity_scores(sentence)
    if score['neg'] > score['pos']:
        return 0
    else:
        return 1

In [10]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()
def sentiment_analyzer_scores_1(sentence):
    score = analyser.polarity_scores(sentence)
    return score['compound']

In [11]:
sentiment_analyzer_scores_1("we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ")

0.8065

In [12]:
df['sentiment_label'] = df['full_text'].apply(sentiment_analyzer_scores_1)

In [13]:
df.head()

Unnamed: 0,full_text,sentiment_label
0,real protective wit my soul where u been,0.0
1,what’s your fav lyric 🌪🌬,0.4588
2,we do but it’s both u feel me,0.0
3,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ,0.8065
4,u deserve the world ! only up from here chicoo...,0.0


In [14]:
analysis = TextBlob("blues")
analysis.polarity

0.0

In [15]:
def labeler(text):
    analysis = TextBlob(text)
    return analysis.polarity

In [16]:
df['TextBlob Label'] = df['full_text'].apply(labeler)

In [17]:
def normalized(arr):
    return (arr - min(arr)) / (max(arr)-min(arr))

In [18]:
df['Normalized Label'] = normalized(df['TextBlob Label'])

In [19]:
X = df['full_text']
y = df['Normalized Label']

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
word_vec = TfidfVectorizer()
word_vec.fit(X_train)
X_train_feat = word_vec.transform(X_train)
X_test_feat = word_vec.transform(X_test)

# Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X_train_feat, y_train)
    score = accuracy_score(y_test, lr.predict(X_test_feat))
    print("Accuracy " + str(c) + " " + "is " + str(score))



ValueError: Unknown label type: 'continuous'

# Ridge Regression

In [24]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=.1)
clf.fit(X_train_feat, y_train)

Ridge(alpha=0.1, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [25]:
clf.score(X_train_feat, y_train)

0.9248266188459918

In [26]:
for i in np.linspace(0, 1, 11):
    clf = Ridge(alpha=i)
    clf.fit(X_train_feat, y_train)
    score = clf.score(X_test_feat, y_test)
    print("Accuracy " + str(i) + " " + "is " + str(score))

Accuracy 0.0 is 0.2038057332848653
Accuracy 0.1 is 0.5802550617546007
Accuracy 0.2 is 0.6074765619171318
Accuracy 0.30000000000000004 is 0.6175113976101022
Accuracy 0.4 is 0.6219092531166448
Accuracy 0.5 is 0.6235991426781967
Accuracy 0.6000000000000001 is 0.6238905312801539
Accuracy 0.7000000000000001 is 0.6233126145663949
Accuracy 0.8 is 0.6221583454155624
Accuracy 0.9 is 0.6206825884002005
Accuracy 1.0 is 0.6189070800265137


In [27]:
clf = Ridge(alpha=.7)
clf.fit(X_train_feat, y_train)
pred = clf.predict(X_test_feat)
result = X_test.to_frame()
result['True'] = y_test
result['Predicted'] = pred
result.head()

Unnamed: 0,full_text,True,Predicted
4375,woooooow the best moments in life are the ones...,0.466667,0.525436
7296,When r u gonna come here?,0.5,0.582177
9396,"""The Affordable Care Act is here to stay."" —Pr...",0.5,0.51849
8718,This is a huge step forward.,0.7,0.604156
10288,The President Obama is taking will grow the e...,0.5,0.53696


In [28]:
def difference(true, pred):
    return round(np.abs(true-pred), 3)

In [29]:
result['Difference'] = difference(result['True'], result['Predicted'])

In [35]:
def label(diff):
    if diff > .1:
        return 0
    else:
        return 1

In [36]:
result['Label Different'] = result['Difference'].apply(label)

In [37]:
sum(result['Label Different'])/len(result)

0.8307489277466182

In [33]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=5, random_state=0, n_estimators=500)
regr.fit(X_train_feat, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=5,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [34]:
regr.score(X_train_feat, y_train)

0.31713671482453115

# Error

In [110]:
def automatic(X_train_feat, y_train, X_test_feat, y_test, best):
    
    clf = Ridge(alpha=best)
    clf.fit(X_train_feat, y_train)
    pred = clf.predict(X_test_feat)
    result = X_test.to_frame()
    result['True'] = y_test
    result['Predicted'] = pred
    
    def difference(true, pred):
        return round(np.abs(true-pred), 3)
    
    def label(diff):
        if diff > .1:
            return 0
        else:
            return 1
    
    result['Difference'] = difference(result['True'], result['Predicted'])
    result['Label Different'] = result['Difference'].apply(label)
    return sum(result['Label Different'])/len(result)

In [115]:
automatic(X_train_feat, y_train, X_test_feat, y_test, .4)

0.8248102936324645

In [60]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

tokenizer_obj = Tokenizer()
total_reviews = np.append(X_train.values, X_test.values)
tokenizer_obj.fit_on_texts(total_reviews)

max_length = max([len(s.split()) for s in total_reviews])

vocab_size = len(tokenizer_obj.word_index) + 1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen=max_length, padding='post')
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_length, padding='post')

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, GRU
from keras.layers.embeddings import Embedding

EMBEDDING_DIM = 100

model = Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length=max_length))
model.add(GRU(units=32, dropout=.2, recurrent_dropout=.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

Using TensorFlow backend.


In [None]:
model.fit(X_train_pad, y_train, batch_size=128, epochs=25, validation_data=(X_test_pad, y_test), verbose=2)

Train on 6152 samples, validate on 3031 samples
Epoch 1/25
