In [12]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [13]:
df = pd.read_csv('uncleanedds.csv')

In [14]:
df = df.drop(columns = ["Unnamed: 0"])
df = df[~df['full_text'].str.startswith('RT')]
df = df[~df['full_text'].str.startswith('LIVE')]
df = df[~df['full_text'].str.startswith('ICYMI')]
df = df[~df['full_text'].str.startswith('WATCH')]
df['no_punc_text'] = df['full_text']

In [15]:
def removeatusernames(tweet):
    return re.sub(r'(@[A-Za-z0-9]+)','',tweet)

In [16]:
def removelinks(tweet):
    return re.sub(r'http\S+','',tweet)

In [17]:
def removehashtags(tweet):
    return re.sub(r'(#[A-Za-z0-9]+)','',tweet)

In [18]:
df['full_text'] = df['full_text'].apply(removelinks).apply(removeatusernames).apply(removehashtags)

In [19]:
df = df[['full_text']]
df.head()

Unnamed: 0,full_text
0,real protective wit my soul where u been
1,what’s your fav lyric 🌪🌬
2,we do but it’s both u feel me
3,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ
4,u deserve the world ! only up from here chicoo...


In [20]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
porter = PorterStemmer()
from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [21]:
def normalized(arr):
    return (arr - min(arr)) / (max(arr)-min(arr))

In [32]:
def labeler_vader(text):
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(text)
    return score['compound']

In [23]:
def labeler(text):
    analysis = TextBlob(text)
    return analysis.polarity

In [24]:
df['Score'] = df['full_text'].apply(labeler)
df['Normalized'] = normalized(df['Score'])

In [25]:
df.head()

Unnamed: 0,full_text,Score,Normalized
0,real protective wit my soul where u been,0.2,0.6
1,what’s your fav lyric 🌪🌬,0.25,0.625
2,we do but it’s both u feel me,0.0,0.5
3,we love u 🖤 thank u 🖤 !! ヽ( ⌒o⌒)人(⌒○⌒ )ﾉ,0.78125,0.890625
4,u deserve the world ! only up from here chicoo...,0.0,0.5


# Stemming and Getting the Score

In [28]:
stem_df = df['full_text'].apply(stemSentence)
stem_df = stem_df.to_frame()
stem_df['Score'] = df['full_text'].apply(labeler)
stem_df['Normalized'] = normalized(df['Score'])

In [29]:
stem_df.head()

Unnamed: 0,full_text,Score,Normalized
0,real protect wit my soul where u been,0.2,0.6
1,what ’ s your fav lyric 🌪🌬,0.25,0.625
2,we do but it ’ s both u feel me,0.0,0.5
3,we love u 🖤 thank u 🖤 ! ! ヽ ( ⌒o⌒ ) 人 ( ⌒○⌒ ) ﾉ,0.78125,0.890625
4,u deserv the world ! onli up from here chicooo...,0.0,0.5


## Stemming Using Vader

In [33]:
vader_df = df['full_text'].apply(stemSentence)
vader_df = vader_df.to_frame()
vader_df['Score'] = df['full_text'].apply(labeler_vader)
# stem_df['Normalized'] = normalized(df['Score'])
vader_df.head()

Unnamed: 0,full_text,Score
0,real protect wit my soul where u been,0.0
1,what ’ s your fav lyric 🌪🌬,0.4588
2,we do but it ’ s both u feel me,0.0
3,we love u 🖤 thank u 🖤 ! ! ヽ ( ⌒o⌒ ) 人 ( ⌒○⌒ ) ﾉ,0.8065
4,u deserv the world ! onli up from here chicooo...,0.0


# Train Test Split

In [34]:
from sklearn.model_selection import train_test_split
X = stem_df['full_text']
y = stem_df['Normalized']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Making the Feature

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
word_vec = TfidfVectorizer()
word_vec.fit(X_train)
X_train_feat = word_vec.transform(X_train)
X_test_feat = word_vec.transform(X_test)

# Testing

In [36]:
from sklearn.linear_model import Ridge
clf = Ridge(alpha=.1)
clf.fit(X_train_feat, y_train)
clf.score(X_train_feat, y_train)

0.887478597602573

In [37]:
def automatic(X_train_feat, y_train, X_test_feat, y_test, best, model):
    
#     clf = Ridge(alpha=best)
    model.fit(X_train_feat, y_train)
    pred = model.predict(X_test_feat)
    result = X_test.to_frame()
    result['True'] = y_test
    result['Predicted'] = pred
    
    def difference(true, pred):
        return round(np.abs(true-pred), 3)
    
    def label(diff):
        if diff > .1:
            return 0
        else:
            return 1
    
    result['Difference'] = difference(result['True'], result['Predicted'])
    result['Label Different'] = result['Difference'].apply(label)
    return sum(result['Label Different'])/len(result)

In [38]:
for i in np.linspace(0, 1, 10):
    clf = Ridge(alpha=i)
    clf.fit(X_train_feat, y_train)
    score = automatic(X_train_feat, y_train, X_test_feat, y_test, i, clf)
    print("Alpha " + str(i) + " " + str(score))

Alpha 0.0 0.6744692433315188
Alpha 0.1111111111111111 0.793685356559608
Alpha 0.2222222222222222 0.8170930865541643
Alpha 0.3333333333333333 0.829613500272183
Alpha 0.4444444444444444 0.8285247686445292
Alpha 0.5555555555555556 0.833968426782798
Alpha 0.6666666666666666 0.8366902558519325
Alpha 0.7777777777777777 0.8399564507348939
Alpha 0.8888888888888888 0.8415895481763745
Alpha 1.0 0.8405008165487208


In [39]:
clf = Ridge(alpha=.9)
clf.fit(X_train_feat, y_train)
automatic(X_train_feat, y_train, X_test_feat, y_test, .7, clf)

0.8426782798040283

# Making a Pickle

## Vector Pickle

In [82]:
pickle_out = open("word_vec.pickle","wb")
pickle.dump(word_vec, pickle_out)
pickle_out.close()

In [83]:
pickle_in = open("word_vec.pickle", 'rb')
vec = pickle.load(pickle_in)

## Ridge Regression Pickle

In [85]:
pickle_out = open("ridge_reg.pickle","wb")
pickle.dump(clf, pickle_out)
pickle_out.close()

In [86]:
pickle_in = open("ridge_reg.pickle", 'rb')
ridge_reg = pickle.load(pickle_in)

## NLTK Pickle

In [88]:
pickle_out = open("nltk.pickle","wb")
pickle.dump(stemSentence, pickle_out)
pickle_out.close()

In [89]:
pickle_in = open("nltk.pickle", 'rb')
nltk_func = pickle.load(pickle_in)

In [90]:
nltk_func("Coming soon")

'come soon '

In [4]:
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
porter = PorterStemmer()
from nltk.tokenize import sent_tokenize, word_tokenize
def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

In [5]:
pickle_out = open("porter.pickle","wb")
pickle.dump(porter, pickle_out)
pickle_out.close()

In [7]:
word_token = word_tokenize

In [8]:
pickle_out = open("word_token.pickle","wb")
pickle.dump(word_token, pickle_out)
pickle_out.close()