In [10]:
#Stanford Data for Sentiment Analysis
#http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [30]:
import os
import pandas as pd
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [2]:
path_for_dataset = r'C:\Users\smattoo5\Desktop\Python Work Book\TwitterSentimentAnalysis_Flask_Pipeline\Hate-Speech-Classification-deployed-using-Flask\dataset'

In [8]:
filename = path_for_dataset+"//twitter_sentiments.csv"
data = pd.read_csv(filename)

In [9]:
data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [12]:
data['len'] = [len(t) for t in data.tweet]
data.head()

Unnamed: 0,id,label,tweet,len
0,1,0,@user when a father is dysfunctional and is s...,102
1,2,0,@user @user thanks for #lyft credit i can't us...,122
2,3,0,bihday your majesty,21
3,4,0,#model i love u take with u all the time in ...,86
4,5,0,factsguide: society now #motivation,39


In [None]:
# Pre-Processing of Tweets

In [13]:
from nltk.tokenize import WordPunctTokenizer
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))

In [21]:
from bs4 import BeautifulSoup
import re

In [22]:
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()

In [24]:
data['cleantext'] = data['tweet'].apply(lambda x: tweet_cleaner(x))
data['newlen'] = data['cleantext'].apply(lambda x: len(x))

In [25]:
data.head()

Unnamed: 0,id,label,tweet,len,cleantext,newlen
0,1,0,@user when a father is dysfunctional and is s...,102,when a father is dysfunctional and is so selfi...,91
1,2,0,@user @user thanks for #lyft credit i can't us...,122,thanks for lyft credit i can t use cause they ...,103
2,3,0,bihday your majesty,21,bihday your majesty,19
3,4,0,#model i love u take with u all the time in ...,86,model i love u take with u all the time in ur,45
4,5,0,factsguide: society now #motivation,39,factsguide society now motivation,33


In [26]:
# Create Corpus

In [27]:
Corpus=[]

In [28]:
Corpus = [each for each in data['cleantext']]

In [29]:
Corpus

['when a father is dysfunctional and is so selfish he drags his kids into his dysfunction run',
 'thanks for lyft credit i can t use cause they don t offer wheelchair vans in pdx disapointed getthanked',
 'bihday your majesty',
 'model i love u take with u all the time in ur',
 'factsguide society now motivation',
 'huge fan fare and big talking before they leave chaos and pay disputes when they get there allshowandnogo',
 'camping tomorrow danny',
 'the next school year is the year for exams can t think about that school exams hate imagine actorslife revolutionschool girl',
 'we won love the land allin cavs champions cleveland clevelandcavaliers',
 'welcome here i m it s so gr',
 'ireland consumer price index mom climbed from previous to in may blog silver gold forex',
 'we are so selfish orlando standwithorlando pulseshooting orlandoshooting biggerproblems selfish heabreaking values love',
 'i get to see my daddy today days gettingfed',
 'cnn calls michigan middle school build the wa

In [33]:
# train test split
train, test = train_test_split(data, test_size = 0.2, stratify = data['label'], random_state=21)

# get the shape of train and test split.
train.shape, test.shape

((25569, 6), (6393, 6))

In [34]:
#from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS, TfidfVectorizer
# create a TF-IDF vectorizer object
tfidf_vectorizer = TfidfVectorizer(lowercase= True, max_features=1000, stop_words=ENGLISH_STOP_WORDS)

In [36]:
tfidf_vectorizer.fit(Corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=frozenset({'a', 'about', 'above', 'across', 'after',
                                      'afterwards', 'again', 'against', 'all',
                                      'almost', 'alone', 'along', 'already',
                                      'also', 'although', 'always', 'am',
                                      'among', 'amongst', 'amoungst', 'amount',
                                      'an', 'and', 'another', 'any', 'anyhow',
                                      'anyone', 'anything', 'anyway',
                                      'anywhere', ...}),
                strip_accents=None, sublinear_tf=False,
                token_pa

In [37]:
train_idf = tfidf_vectorizer.transform(train.cleantext)
test_idf  = tfidf_vectorizer.transform(test.cleantext)

In [38]:
# Build Model RandomForest
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()

In [39]:
rfc.fit(train_idf, train.label)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [40]:
predict = rfc.predict(test_idf)

In [43]:
# Training Score
TrainingF1Score = f1_score(y_true= train.label, y_pred= rfc.predict(train_idf))
TrainingF1Score

0.9061566049013747

In [44]:
# Validation Score
TestingF1Score = f1_score(y_true= test.label, y_pred= rfc.predict(test_idf))
TestingF1Score

0.6067415730337079

# Build a Pipeline

In [47]:
# define the stages of the pipeline
pipeline = Pipeline(steps= [('tfidf', TfidfVectorizer(lowercase=True,
                                                      max_features=1000,
                                                      stop_words= ENGLISH_STOP_WORDS)),
                            ('model', RandomForestClassifier())])

# fit the pipeline model with the training data                            
pipeline.fit(train.cleantext, train.label)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=1000,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=frozenset({'a', 'about', 'above',
                                                       'across', 'after',
                                                       'afterward...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                               

In [63]:
# sample tweet
text = ['omfg i m offended i m a mailbox and i m proud mailboxpride liberalisme']

In [64]:
pipeline.predict(text)

array([0], dtype=int64)

In [66]:
from joblib import dump

# dump the pipeline model
dump(pipeline, filename="text_classification.joblib")

['text_classification.joblib']

In [68]:
#!pip install tweepy

Collecting tweepy
  Downloading tweepy-3.8.0-py2.py3-none-any.whl (28 kB)
Collecting requests-oauthlib>=0.7.0
  Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Using cached oauthlib-3.1.0-py2.py3-none-any.whl (147 kB)
Installing collected packages: oauthlib, requests-oauthlib, tweepy
Successfully installed oauthlib-3.1.0 requests-oauthlib-1.3.0 tweepy-3.8.0


In [69]:
# Connect Model and HTML Page

In [71]:
# importing the required libraries
from flask import Flask, render_template, request, redirect, url_for
from joblib import load
#from get_tweets import get_related_tweets

In [72]:
# load the pipeline object
pipeline = load("text_classification.joblib")

In [73]:
#Use Flask