## Importing necessary libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
# Show every column when a DataFrame is displayed.
pd.options.display.max_columns = None
# None (rather than the deprecated -1) turns off column-width truncation so
# full tweets are visible; negative values are rejected by newer pandas.
pd.set_option('display.max_colwidth', None)

from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

ModuleNotFoundError: No module named '_plotly_utils'

#### Chart studio helps to embed interactive plotly graphs in platforms outside jupyter

In [None]:
import chart_studio
import chart_studio.plotly as py
import chart_studio.tools as tls

- Setting up the credentials from Plotly

In [None]:
import os

# Credentials come from the environment so the API key is never stored in the
# notebook or in version control. Export PLOTLY_USERNAME and PLOTLY_API_KEY
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key that used to be hard-coded in this cell should be
# treated as leaked and regenerated in the Chart Studio account settings.
tls.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

#### Text preprocessing libraries

In [None]:
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator


import nltk
from nltk.corpus import stopwords

from tqdm import tqdm
import os
import nltk
import spacy
import random
from spacy.util import compounding
from spacy.util import minibatch
import re
import string
import codecs
import unidecode


import warnings
warnings.filterwarnings("ignore")


### Function to generate random colours
#### Can be used by passing number of colours needed

In [None]:
def random_colours(number_of_colors):
    '''
    Build a list of random hex colour strings.
    Input:
        number_of_colors - integer, how many colours to generate.
    Output:
        List with one string per requested colour, e.g. ['#E86DA4'].
    '''
    hex_digits = '0123456789ABCDEF'
    return [
        "#" + ''.join(random.choice(hex_digits) for _ in range(6))
        for _ in range(number_of_colors)
    ]

In [None]:
train_data = pd.read_csv('../data/train.csv',encoding='utf-8')

In [None]:
test_data = pd.read_csv('../data/test.csv',encoding='utf-8')

In [None]:
test_data.shape

In [None]:
train_data.head()

In [None]:
train_data.loc[0:15,'tweet']

In [None]:
train_data.shape

In [None]:
test_data.isnull().sum()

In [None]:
train_data.isnull().sum()

- We will remove the only null row present

In [None]:
train_data[train_data.isnull().any(axis=1)]

In [None]:
train_data.dropna(inplace=True)

In [None]:
train_data.shape

In [None]:
train_data.describe()

## Adding Sentiment text for better visualizations
### We have done dictionary mapping to reflect the textual meaning of the sentiment classes
* 0: Negative
* 1: Neutral
* 2: Positive
* 3: Can't Tell

In [None]:
sentiment_dict = {0:'Negative',1:'Neutral',2:'Positive',3:"""Can't tell"""}
train_data['vis_sentiment'] = train_data['sentiment'].map(sentiment_dict)
train_data.head()

In [None]:
temp_vis = train_data.groupby('vis_sentiment').count()['tweet'].reset_index().sort_values(by='tweet',ascending=False)
temp_vis.style.background_gradient(cmap='Blues')

### Univariate Analysis of the Sentiment Data

In [None]:
plt.figure(figsize=(12,6))
sns.countplot(x='vis_sentiment',data=train_data)

In [None]:
funnel = go.Figure(go.Funnelarea(
    text =temp_vis.vis_sentiment,
    values = temp_vis.tweet,
    title = {"position": "top center"}
    ))
funnel.show()

- We see that more than half of the tweets are classified as Neutral tweets
- Less than 10% of tweets are in the negative and can't tell categories

### Bivariate Analysis
- Character Counts for Positive and Negative tweets

In [None]:
py.plot(funnel, filename='Funnel chart',auto_open=False)

In [None]:
fig, ax = plt.subplots(figsize=(12,7))
for label, group in train_data.groupby('sentiment'):
    sns.distplot(group['tweet'].str.len(), label=str(label), ax=ax)
plt.xlabel('# of characters')
plt.ylabel('density')
plt.legend()
sns.despine()

In [None]:
train_data['tweet_len']= train_data['tweet'].apply(len)
data = [
    go.Box(
        y=train_data[train_data['sentiment']==0]['tweet_len'],
        name='Negative'
    ),
    go.Box(
        y=train_data[train_data['sentiment']==2]['tweet_len'],
        name='Positive'
    ),
    go.Box(
        y=train_data[train_data['sentiment']==1]['tweet_len'],
        name = 'Neutral')
]
layout = go.Layout(
    title = 'Comparison of character count in Tweets '
)
char_box = go.Figure(data=data, layout=layout)
char_box.show()

In [None]:
train_data['word_count']= train_data['tweet'].apply(lambda x: len(str(x).split()))

In [None]:
train_data.head()

In [None]:
# Box plot of the per-tweet word count for each sentiment class.
# The copy-pasted `tweet_len` recomputation that used to open this cell was
# removed: this chart only reads the `word_count` column.
data = [
    go.Box(
        y=train_data[train_data['sentiment']==code]['word_count'],
        name=label
    )
    # Same trace order as before: Negative, Positive, Neutral.
    for code, label in [(0, 'Negative'), (2, 'Positive'), (1, 'Neutral')]
]
layout = go.Layout(
    title = 'Comparison of word count in Tweets '
)
word_count_box = go.Figure(data=data, layout=layout)
word_count_box.show()

In [None]:
py.plot(word_count_box, filename='Word_count_box',auto_open=False)

### Twitter Character Count : 280 characters since 2017, earlier it was 140 characters 
##### FOR THIS PARTICULAR DATASET WE CONSIDER 140 CHARACTERS AS THE DATA HAS DATES FROM 2011.
What is Counted:

   - Any character in the text of your post, including spaces
   - Emojis (1 emoji registers as 2 characters)
   - Hashtags
   - Twitter handles (when mentioning an account)
   - Links*

What is not Counted:

   - Visual content (images, GIFs, and videos)
   - Polls
   - Quote Tweets
   - Twitter handles (only when you are replying to a Tweet)


# Cleaning the corpus

* *We see that common hashtags such as #sxsw, and other hashtags containing "sxsw", are present in almost every tweet. We can remove them since they are unlikely to help us differentiate between sentiments.*
* The user handles for all the tweets have been replaced by **@mention**
* The retweets are identified by character **RT**, hence these can be removed as well
* The url's in tweets have been replaced by **{link}** and so these can be removed
* Other cleaning steps involve the removal of non-alphabetic characters (digits, special symbols, punctuation)


#### Post note: though the cleaning helped us get more insights, the model only trained well when all the Twitter data was used in its rawest form

In [None]:
train_data.loc[0:20,'tweet']

### BeautifulSoup - decoding HTML to plain text; this replaces entities such as `&amp;` and `&quot;` with `&` and `"`, etc.

In [None]:
from bs4 import BeautifulSoup
def remove_html_encodings(x):
    """Parse `x` as HTML with the lxml parser and return only its text content."""
    return BeautifulSoup(x, 'lxml').get_text()

In [None]:
train_data["tweet"] = train_data["tweet"].apply(lambda x: remove_html_encodings(x))

In [None]:
train_data.head(20)

In [None]:
test_data['tweet'] = test_data['tweet'].apply(lambda x: remove_html_encodings(x))

In [None]:
test_data.head()

#### Removing all hashtags with SXSW/sxsw in it, as they are common to all

In [None]:
train_data =  train_data[train_data['sentiment'] != 3]

In [None]:

# Delete every whitespace-delimited token that contains "sxsw" in any case.
# The pattern is a raw string so that `\s` reaches the regex engine as written;
# in a plain string literal '\s' is an invalid escape sequence and triggers a warning.
train_data['tweet'] = train_data['tweet'].apply(lambda x: re.sub(r'[^\s]*sxsw[^\s]*', '', x, flags=re.IGNORECASE))

# train_data['hashtags'] = train_data['tweet'].str.findall(r'#.*?(?=\s|$)') #finding and seperating all hashtags into a seperate column


In [None]:
train_data.head()

In [None]:
# Same sxsw-token removal as for the training tweets, applied to the test tweets.
# Raw string: '\s' in a plain string literal is an invalid escape sequence.
test_data['tweet'] = test_data['tweet'].apply(lambda x: re.sub(r'[^\s]*sxsw[^\s]*', '', x, flags=re.IGNORECASE))
test_data.head()

### Extracting Hashtags
- Extracting all other hashtags for EDA, before we clean the data ahead

In [None]:
def hashtag_extract(x):
    """Collect the hashtags of every tweet in a pandas Series of strings.

    Each tweet is lower-cased first. The result is a list with one inner list
    per tweet, holding that tweet's hashtag words without the leading '#'.
    """
    return [re.findall(r"#(\w+)", tweet) for tweet in x.str.lower()]

In [None]:
# Per-tweet hashtag lists for each sentiment class and for the whole training set.
positive_hashtags = hashtag_extract(train_data['tweet'][train_data['sentiment'] == 2])
negative_hashtags = hashtag_extract(train_data['tweet'][train_data['sentiment'] == 0])
neutral_hashtags = hashtag_extract(train_data['tweet'][train_data['sentiment'] == 1])
total_hashtags = hashtag_extract(train_data['tweet'])

# Flatten the nested lists. A comprehension does this in linear time, whereas
# sum(list_of_lists, []) copies the growing accumulator on every addition.
HT_positive = [tag for tags in positive_hashtags for tag in tags]
HT_negative = [tag for tags in negative_hashtags for tag in tags]
HT_neutral = [tag for tags in neutral_hashtags for tag in tags]
HT_total = [tag for tags in total_hashtags for tag in tags]

In [None]:
positive_hashtags

In [None]:
a = nltk.FreqDist(HT_total)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting the 6 most frequent hashtags
d = d.nlargest(columns="Count", n = 6) 
# plt.figure(figsize=(16,5))
# ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
# ax.set(ylabel = 'Count')
# plt.title('Count chart for All Hashtags')
# plt.show()
colors = ['#636efa','#ef553b','#00cc96','#ab63fa','#ffa15a','#19d3f3']
# colors[1] = 'crimson'

all_hashtags = go.Figure()
all_hashtags.add_trace(go.Bar(x=d.Hashtag,
    y=d.Count,
    name='All Hashtags',
    marker_color=colors
))
# all_hashtags = px.bar(d, x='Hashtag', y='Count',color ='Count')
# all_hashtags.show()

In [None]:
py.plot(all_hashtags, filename='All Hashtags',auto_open=False)

In [None]:
a = nltk.FreqDist(HT_positive)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting the 6 most frequent hashtags
d = d.nlargest(columns="Count", n = 6) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.title('Count chart for Positive Hashtags')
plt.show()

In [None]:
colors = ['#636efa','#ef553b','#00cc96','#ab63fa','#ffa15a','#19d3f3']
# colors[1] = 'crimson'

positive_hashtags = go.Figure()
positive_hashtags.add_trace(go.Bar(x=d.Hashtag,
    y=d.Count,
    name='Positive Hashtags',
    marker_color=colors
))
positive_hashtags.update_layout(title_text='Positive Hashtags')

In [None]:
py.plot(positive_hashtags, filename='Positive Hashtags',auto_open=False)

In [None]:
a = nltk.FreqDist(HT_negative)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting the 6 most frequent hashtags
d = d.nlargest(columns="Count", n = 6) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.title('Count chart for Negative Hashtags')
plt.show()

In [None]:
colors = ['#636efa','#ef553b','#00cc96','#ab63fa','#ffa15a','#19d3f3']
# colors[1] = 'crimson'

negative_hashtags = go.Figure()
negative_hashtags.add_trace(go.Bar(x=d.Hashtag,
    y=d.Count,
    name='Negative Hashtags',
    marker_color=colors
))
negative_hashtags.update_layout(title_text='Negative Hashtags')

In [None]:
py.plot(negative_hashtags, filename='Negative Hashtags',auto_open=False)

In [None]:
a = nltk.FreqDist(HT_neutral)
d = pd.DataFrame({'Hashtag': list(a.keys()),
                  'Count': list(a.values())})
# selecting the 6 most frequent hashtags
d = d.nlargest(columns="Count", n = 6) 
plt.figure(figsize=(16,5))
ax = sns.barplot(data=d, x= "Hashtag", y = "Count")
ax.set(ylabel = 'Count')
plt.title('Count chart for Neutral Hashtags')
plt.show()

In [None]:
colors = ['#636efa','#ef553b','#00cc96','#ab63fa','#ffa15a','#19d3f3']
# colors[1] = 'crimson'

neutral_hashtags = go.Figure()
neutral_hashtags.add_trace(go.Bar(x=d.Hashtag,
    y=d.Count,
    name='Neutral Hashtags',
    marker_color=colors
))
neutral_hashtags.update_layout(title_text='Neutral Hashtags')

In [None]:
py.plot(neutral_hashtags, filename='Neutral Hashtags',auto_open=False)

In [None]:
# Creating a dictionary of contractions for contraction to expansion

contraction_mapping = {"aight" : "alright",
 "ain't": "am not",
 "amn't" : "am not",
 "aren't": "are not",
 "can't": "cannot",
 "'cause" : "because",
 "could've": "could have",
 "couldn't" : "could not",
 "couldn't've" : "could not have", 
 "daren't" : "dare not",
 "daresn't" : "dare not",
 "dasn't" : "dare not",
 "didn't" : "did not",
 "doesn't" : "does not",
 "don't" : "do not",
 "d'ye" : "do you",
 "e'er" : "ever",
 "everybody's" : "everybody is",
 "everyone's" : "everyone is",
 "finna":"fixing to",
 "g'day" : "good day",
 "gimme" : "give me",
 "giv'n": "given",
 "gonna":"going to",
 "gon't":"go not",
 "gotta":"got to",
 "hadn't":"had not",
 "had've":"had have",
 "hasn't":"has not",
 "haven't":"have not",
 "he'd":"he would",
 "he'dn't've'd":"he would not have had",
 "he'll":"he will",
 "he's":"he is",
 "he've":"he have",
 "how'd":"how did",
 "howdy":"how do you do",
 "how'll":"how will",
 "how're":"how are",
 "how's":"how has",
 "i'd": "i would",
 "i'd've":"i would have",
 "i'll": "i will",
 "i'm": "i am",
 "i'm'a": "i am about to",
 "i'm'o": "i am going to",
 "innit": "is it not",
 "i've": "i have",
 "isn't": "is not",
 "it'd": "it would",
 "it'll": "it will",
 "it's": "it is",
 "let's": "let us", 
 "ma'am": "madam",
 "mayn't": "may not",
 "may've": "may have",
 "methinks" : "me thinks",
 "mightn't": "might not",
 "might've": "might have",
 "mustn't": "must not",
 "mustn't've": "must not have",
 "must've": "must have",
 "needn't": "need not",
 "ne'er":"never",
 "o'clock": "of the clock",
 "o'er": "over",
 "ol'": "old",
 "oughtn't":"ought not",
 "'s": "is",
 "shalln't":"shall not",
 "shan't":"shall not",
 "she'd":"she would",
 "she'll":"she will",
 "she's":"she is",
 "should've":"should have",
 "shouldn't":"should not",
 "shouldn't've":"should not have",
 "somebody's":"somebody is",
 "someone's":"someone is",
 "something's":"something is",
 "so're":"so you are",
 "that'll":"that will",
 "that're":"that are",
 "that's":"that is",
 "that'd":"that had",
 "there'd":"there would",
 "there'll":"here shall",
 "there're":"there are",
 "there's":"there has",
 "these're":"these are",
 "these've":"these have",
 "they'd":"they would",
 "they'll":"they will",
 "they're":"they are",
 "they've":"they have",
 "this's":"this is",
 "those're":"those are",
 "those've":"those have",
 "'tis":"it is",
 "to've":"to have",
 "'twas":"it was",
 "wanna":"want to",
 "wasn't":"was not",
 "we'd":"we would",
 "we'd've":"we would have",
 "we'll":"we will",
 "we're":"we are",
 "we've":"we have",
 "weren't":"were not",
 "what'd":"what did",
 "what'll":"what will",
 "what're":"what are",
 "what's":"what is",
 "what've":"what have",
 "when's":"when is",
 "where'd":"where did",
 "where'll":"where will",
 "where're":"where are",
 "where's":"where has",
 "where've":"where have",
 "which'd":"which had",
 "which'll":"which shall",
 "which're":"which are",
 "which's":"which has",
 "which've":"which have",
 "who'd":"who would",
 "who'd've":"who would have",
 "who'll":"who will",
 "who're":"who are",
 "who's":"who has",
 "who've":"who have",
 "why'd":"why did",
 "why're":"why are",
 "why's":"why is",
 "won't":"will not",
 "would've":"would have",
 "wouldn't":"would not",
 "wouldn't've":"would not have",
 "y'all":"you all",
 "y'all'd've":"you all would have",
 "y'all'dn't've'd":"you all would not have had",
 "y'all're":"you all are",
 "you'd":"you would",
 "you'll":"you will",
 "you're":"you are",
 "you've":"you have",
  " u " : " you",
 " ur " : " your",
 " n ": " and ",
 " w/ " : " with ",
 " apples ": " apple is "}

In [None]:
def clean_text(text):
    '''Normalise a raw tweet into lowercase text without markup or punctuation.

    Steps, in order: transliterate to ASCII, lowercase, expand contractions,
    remove text in square brackets, remove the {link} and @mention placeholders,
    remove the standalone retweet marker "rt", remove <...> tags, remove
    punctuation, turn line breaks into spaces, remove words containing digits
    and squeeze any character repeated 3+ times down to two.

    Input:
        text - the tweet as a string.
    Output:
        The cleaned string. Extra spaces left behind by the removals are not collapsed.
    '''
    try:
        decoded = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
    except Exception:
        # unicode_escape fails on malformed escapes (e.g. a trailing backslash);
        # in that case transliterate the text as it is.
        decoded = unidecode.unidecode(text)
    text = re.sub("’", "'", decoded)
    # Lowercase BEFORE the contraction lookup: every key in contraction_mapping
    # is lowercase, so "Don't" / "I'm" were previously never expanded.
    text = str(text).lower()
    text = ' '.join([contraction_mapping.get(t, t) for t in text.split(" ")])
    text = re.sub(r'\[.*?\]', '', text)  # removes any words in square brackets
    text = re.sub(re.escape('{link}'), '', text)  # removes the {link} placeholder
    text = re.sub('@mention', '', text)  # removes user handles
    # Only the standalone retweet marker is removed; a bare 'rt' pattern also
    # ate the letters inside words ("party" -> "pay", "start" -> "sta").
    text = re.sub(r'\brt\b', '', text)
    text = re.sub(r'<.*?>+', '', text)  # removes anything wrapped in <...>
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # removes punctuation
    # Line breaks become a space so the words on either side are not glued together.
    text = re.sub(r'[\r\n]+', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)  # removes words with digits
    text = re.sub(r'(.)\1+', r'\1\1', text)  # "sooooo" -> "soo"
    return text

In [None]:
train_data['cleaned_text'] = train_data['tweet'].apply(lambda x:clean_text(x))
train_data['cleaned_text'] = train_data['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
train_data.loc[0:20,'cleaned_text']

In [None]:
test_data['cleaned_text'] = test_data['tweet'].apply(lambda x:clean_text(x))
test_data['cleaned_text'] = test_data['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
test_data.loc[0:20,'cleaned_text']

In [None]:
#Applying NER after first level of cleaning

import spacy
from spacy import displacy
from spacy.pipeline import EntityRuler

# Load the model once. The earlier second `spacy.load("en")` call replaced
# `nlp` with a fresh pipeline and silently threw away the EntityRuler below.
# NOTE(review): if the "en" shortcut pointed at a model other than
# en_core_web_sm on your machine, entity results may differ slightly.
# NOTE(review): EntityRuler(nlp) + nlp.add_pipe(component) is the spaCy v2 API,
# kept as in the original notebook.
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp)

# Token patterns on LOWER match regardless of case, so "Apple", "APPLE" and
# "apple" are all tagged; plain lowercase phrase patterns could not match the
# cased tweets that `ner` is applied to.
pattern = [{"label":"ORG", "pattern": [{"LOWER": "apple"}]},
           {"label":"ORG", "pattern": [{"LOWER": "google"}]},
           {"label":"ORG", "pattern": [{"LOWER": "facebook"}]},
           {"label":"ORG", "pattern": [{"LOWER": "amazon"}]},
           {"label":"ORG", "pattern": [{"LOWER": "microsoft"}]}]

ruler.add_patterns(pattern)
# Run the ruler before the statistical NER so its ORG spans are set first and
# respected by the model; added last, it would not override the model's entities.
nlp.add_pipe(ruler, before="ner")

def ner(text):
    """Run the spaCy pipeline on `text` and return the entity spans of the resulting doc."""
    doc = nlp(text)
    return doc.ents

# One list of (entity text, entity label) pairs per tweet.
train_data["named_entity_1"] = train_data["tweet"].apply(lambda x : ([(word.text, word.label_) for word in ner(x)]))

In [None]:
train_data.head(20)

In [None]:
# Reuse the (text, label) pairs stored in `named_entity_1` (built from the same
# `tweet` column) instead of running the spaCy pipeline over every tweet again.
train_data["organizations"] = train_data["named_entity_1"].apply(lambda ents: [text for text, label in ents if label == "ORG"])

In [None]:
import operator
dict_org = {}
def create_set(x):
    """Add one to the tally kept in `dict_org` for every item of the iterable `x`."""
    for name in x:
        dict_org[name] = dict_org.get(name, 0) + 1
    
train_data["organizations"].apply(lambda x:create_set(x))
sorted(dict_org.items(), key=lambda x: x[1], reverse=True)

In [None]:
orgs = {'org_apple':['apple','iphone','ipads','ipad','iphones','itunes','ipad2','ios','mac','macos','macbook','ipod'],
        'org_google':['google','android','andoid','nexus'],
        'org_uber':['uber','uberguide'],
        'org_microsoft':['microsoft','bing','windows'],
         'org_facebook':['facebook']}

In [None]:
for key in orgs:
    train_data[key] = 0
train_data.head()
# train_data.drop(['Apple','Google','Uber','Microsoft','Facebook'],axis=1,inplace=True)

In [None]:
train_data.loc[0,'tweet']

In [None]:
# Set each organisation column to 1 when the cleaned tweet contains at least one
# of that organisation's keywords, else 0. One pass per column replaces the
# row-by-row iterrows() loop with per-cell .loc writes, and re-running the cell
# always yields the same flags.
cleaned_tokens = train_data['cleaned_text'].str.split()
for key, keywords in orgs.items():
    keyword_set = set(keywords)
    train_data[key] = cleaned_tokens.apply(
        lambda words, kw=keyword_set: int(not kw.isdisjoint(words))
    )
# Preview only; the full frame is too large to display usefully.
train_data.head()

In [None]:
# Reuse the (text, label) pairs stored in `named_entity_1` (built from the same
# `tweet` column) instead of running the spaCy pipeline over every tweet again.
train_data["location"] = train_data["named_entity_1"].apply(lambda ents: [text for text, label in ents if label == "GPE"])

In [None]:
# Reuse the (text, label) pairs stored in `named_entity_1` (built from the same
# `tweet` column) instead of running the spaCy pipeline over every tweet again.
train_data["person"] = train_data["named_entity_1"].apply(lambda ents: [text for text, label in ents if label == "PERSON"])

In [None]:
train_data.head()

In [None]:
# import nltk
# nltk.download('averaged_perceptron_tagger')

In [None]:

dict_location = {}
def create_location_set(x):
    """Count every item of the iterable `x` into the module-level `dict_location` tally."""
    for place in x:
        if place not in dict_location:
            dict_location[place] = 0
        dict_location[place] += 1
    
train_data["location"].apply(lambda x:create_location_set(x))
sorted(dict_location.items(), key=lambda x: x[1], reverse=True)

In [None]:

dict_person = {}
def create_person_set(x):
    """Increase the count stored in `dict_person` once for each item of the iterable `x`."""
    for person in x:
        try:
            dict_person[person] += 1
        except KeyError:
            dict_person[person] = 1
    
train_data["person"].apply(lambda x:create_person_set(x))
sorted(dict_person.items(), key=lambda x: x[1], reverse=True)

In [None]:
# Number of tweets mentioning each organisation, overall and per sentiment class
# (2 = Positive, 0 = Negative, 1 = Neutral).
total=[]
positive=[]
negative=[]
neutral=[]
for key in orgs:
    mentioned = train_data[key] == 1
    total.append(mentioned.sum())
    positive.append((mentioned & (train_data['sentiment']==2)).sum())
    negative.append((mentioned & (train_data['sentiment']==0)).sum())
    neutral.append((mentioned & (train_data['sentiment']==1)).sum())
# Labels are derived from the keys of `orgs` ('org_apple' -> 'Apple') so they can
# never fall out of step with the order or number of entries in that dict.
x = [key.split('_', 1)[1].capitalize() for key in orgs]
plot_org_data = pd.DataFrame({
    'Organisation': x,
    'Tweet_Count': total,
    'Positive_Count': positive,
    'Negative_Count': negative,
    'Neutral_Count': neutral,
})
    

In [None]:
plot_org_data = plot_org_data.sort_values(by='Tweet_Count',ascending=False).reset_index()

In [None]:
plot_org_data.drop('index',axis=1,inplace=True)

In [None]:
plot_org_data

In [None]:
tweet_by_org = go.Figure(data=[
    go.Bar(x=plot_org_data['Organisation'], y=plot_org_data['Tweet_Count'],),
])
# Change the bar mode
tweet_by_org.update_layout(barmode='group')
tweet_by_org.show()

In [None]:
py.plot(tweet_by_org, filename='Organisation_tweet_count',auto_open=False)

In [None]:
plot_org_data_t = plot_org_data.transpose()

In [None]:
plot_org_data_t.columns = plot_org_data_t.iloc[0]

In [None]:
# The 'Organisation' row has just been promoted to the column header, so drop it
# from the body. Dropping by label (instead of iloc[1:]) makes the cell safe to
# re-run: a second run finds nothing to drop rather than removing another row.
plot_org_data_t = plot_org_data_t.drop(index='Organisation', errors='ignore')

In [None]:
plot_org_data_t.index

In [None]:
# Drop the overall 'Tweet_Count' row so only the per-sentiment rows remain for
# the pie charts. Dropping by label (instead of a second iloc[1:]) states the
# intent and keeps the cell idempotent when re-run.
plot_org_data_t = plot_org_data_t.drop(index='Tweet_Count', errors='ignore')

In [None]:
tweet_by_sent_org = go.Figure(data=[
    go.Bar(name='Neutral', x=plot_org_data['Organisation'], y=plot_org_data['Neutral_Count']),
    go.Bar(name='Positive', x=plot_org_data['Organisation'], y=plot_org_data['Positive_Count']),
    go.Bar(name='Negative', x=plot_org_data['Organisation'], y=plot_org_data['Negative_Count'])
])
# Change the bar mode
tweet_by_sent_org.update_layout(barmode='group')
tweet_by_sent_org.show()

In [None]:
py.plot(tweet_by_sent_org, filename='Organisation wise Sentiment Count',auto_open=False)

In [None]:
apple_pie = px.pie(plot_org_data_t, values='Apple', names=plot_org_data_t.index, title='Apple')
apple_pie.show()

In [None]:
py.plot(apple_pie, filename='Sentiment Distribution for Apple',auto_open=False)

In [None]:
google_pie = px.pie(plot_org_data_t, values='Google', names=plot_org_data_t.index, title='Google')
google_pie.show()

In [None]:
py.plot(google_pie, filename='Sentiment Distribution for Google',auto_open=False)

In [None]:
microsoft_pie = px.pie(plot_org_data_t, values='Microsoft', names=plot_org_data_t.index, title='Microsoft')
microsoft_pie.show()

In [None]:
py.plot(microsoft_pie, filename='Sentiment Distribution for Microsoft',auto_open=False)

In [None]:
uber_pie = px.pie(plot_org_data_t, values='Uber', names=plot_org_data_t.index, title='Uber')
uber_pie.show()

In [None]:
py.plot(uber_pie, filename='Sentiment Distribution for Uber',auto_open=False)

In [None]:
facebook_pie = px.pie(plot_org_data_t, values='Facebook', names=plot_org_data_t.index, title='Facebook')
facebook_pie.show()

In [None]:
py.plot(facebook_pie, filename='Sentiment Distribution for Facebook',auto_open=False)

In [None]:
train_data['temp_list'] = train_data['cleaned_text'].apply(lambda x:str(x).split())
top = Counter([item for sublist in train_data['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(20))
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Blues')

In [None]:
# Built once instead of on every row: wordcloud's STOPWORDS plus a few extra
# tokens that are treated as noise in this corpus.
_TWEET_STOPWORDS = set(STOPWORDS).union({'amp', "quot","via","will"})

def remove_stopword(x):
    """Return the tokens of the iterable `x` that are not stop words, in their original order."""
    return [y for y in x if y not in _TWEET_STOPWORDS]
train_data['temp_list'] = train_data['temp_list'].apply(lambda x:remove_stopword(x))

In [None]:
train_data.head()

In [None]:
top = Counter([item for sublist in train_data['temp_list'] for item in sublist])
temp = pd.DataFrame(top.most_common(100))
temp = temp.iloc[1:,:]
temp.columns = ['Common_words','count']
temp.style.background_gradient(cmap='Purples')

In [None]:
tree_common = px.treemap(temp.head(20), path=['Common_words'], values='count',title='Tree of Most Common Words')
tree_common.show()

In [None]:
py.plot(tree_common, filename='Most Common Words',auto_open=False)

In [None]:
Negative_sent = train_data[train_data['sentiment']==0]
Positive_sent = train_data[train_data['sentiment']==2]
Neutral_sent = train_data[train_data['sentiment']==1]
# Canttell_sent = train_data[train_data['sentiment']==3]

In [None]:
top = Counter([item for sublist in Positive_sent['temp_list'] for item in sublist])
temp_positive = pd.DataFrame(top.most_common(100))
temp_positive.columns = ['Common_words','count']
temp_positive.style.background_gradient(cmap='Greens')

In [None]:
bar_pos_common = px.bar(temp_positive.head(20), x="count", y="Common_words", title='Most Commmon Positive Words', orientation='h', 
             width=700, height=700,color='Common_words')
bar_pos_common.show()

In [None]:
py.plot(bar_pos_common, filename='Positive Common Words',auto_open=False)

In [None]:
tree_pos_common = px.treemap(temp_positive.head(20), path=['Common_words'], values='count',title='Tree Of Most Common Positive Words')
tree_pos_common.show()

In [None]:
py.plot(tree_pos_common, filename='Positive Common Words Tree',auto_open=False)

In [None]:
top = Counter([item for sublist in Negative_sent['temp_list'] for item in sublist])
temp_negative = pd.DataFrame(top.most_common(100))
temp_negative = temp_negative.iloc[1:,:]
temp_negative.columns = ['Common_words','count']
temp_negative.style.background_gradient(cmap='Reds')

In [None]:
bar_neg_common = px.bar(temp_negative.head(20), x="count", y="Common_words", title='Most Commmon Negative Words', orientation='h', 
             width=700, height=700,color='Common_words')
bar_neg_common.show()

In [None]:
py.plot(bar_neg_common, filename='Negative Common Words',auto_open=False)

In [None]:
tree_neg_common = px.treemap(temp_negative.head(20), path=['Common_words'], values='count',title='Tree Of Most Common Negative Words')
tree_neg_common.show()

In [None]:
py.plot(tree_neg_common, filename='Negative Common Words Tree',auto_open=False)

In [None]:
#MosT common Neutral words
top = Counter([item for sublist in Neutral_sent['temp_list'] for item in sublist])
temp_neutral = pd.DataFrame(top.most_common(100))
temp_neutral = temp_neutral.loc[1:,:]
temp_neutral.columns = ['Common_words','count']
temp_neutral.style.background_gradient(cmap='Reds')

In [None]:
bar_neu_common = px.bar(temp_neutral.head(20), x="count", y="Common_words", title='Most Commmon Neutral Words', orientation='h', 
             width=700, height=700,color='Common_words')
bar_neu_common.show()

In [None]:
py.plot(bar_neu_common, filename='Neutral Common Words',auto_open=False)

In [None]:
tree_neu_common = px.treemap(temp_neutral.head(20), path=['Common_words'], values='count',title='Tree Of Most Common Neutral Words')
tree_neu_common.show()

In [None]:
py.plot(tree_neu_common, filename='Neutral Common Words Tree',auto_open=False)

### We will now generate bigrams and trigrams to see what phrases and words were used in Positive and Negative Tweets

In [None]:
def generate_ngrams (text,n=1):
    """Return the word n-grams of `text` as space-joined strings.

    The text is lower-cased and split on single spaces; empty pieces and
    words found in STOPWORDS are discarded before the n-grams are formed.
    """
    words = []
    for piece in text.lower().split(' '):
        if piece != '' and piece not in STOPWORDS:
            words.append(piece)
    # Zipping n progressively shifted copies of the word list yields the n-grams.
    shifted = [words[offset:] for offset in range(n)]
    return [' '.join(gram) for gram in zip(*shifted)]
N=20

In [None]:
from collections import defaultdict
positive_bigrams = defaultdict(int)
negative_bigrams = defaultdict(int)

for tweet in Positive_sent['cleaned_text']:
    for word in generate_ngrams(tweet, n=2):
        positive_bigrams[word] += 1
        
for tweet in Negative_sent['cleaned_text']:
    for word in generate_ngrams(tweet, n=2):
        negative_bigrams[word] += 1
        
positive_bigrams_df = pd.DataFrame(sorted(positive_bigrams.items(), key=lambda x: x[1])[::-1])
negative_bigrams_df = pd.DataFrame(sorted(negative_bigrams.items(), key=lambda x: x[1])[::-1])


positive_bigrams_df = positive_bigrams_df.sort_values(by = 1,ascending=True)
negative_bigrams_df = negative_bigrams_df.sort_values(by = 1,ascending=True)

In [None]:

positive_trigrams = defaultdict(int)
negative_trigrams = defaultdict(int)

for tweet in Positive_sent['cleaned_text']:
    for word in generate_ngrams(tweet, n=3):
        positive_trigrams[word] += 1
        
for tweet in Negative_sent['cleaned_text']:
    for word in generate_ngrams(tweet, n=3):
        negative_trigrams[word] += 1
        
positive_trigrams_df = pd.DataFrame(sorted(positive_trigrams.items(), key=lambda x: x[1])[::-1])
negative_trigrams_df = pd.DataFrame(sorted(negative_trigrams.items(), key=lambda x: x[1])[::-1])

positive_trigrams_df = positive_trigrams_df.sort_values(by = 1,ascending=True)
negative_trigrams_df = negative_trigrams_df.sort_values(by = 1,ascending=True)

In [None]:
negative_bigrams_df

In [None]:
# positive_bigrams_df is sorted by ascending count, so its last 15 rows are the
# 15 most frequent bigrams. tail() replaces the hard-coded row positions, which
# only held for one exact size of the frame.
top_positive_bigrams = positive_bigrams_df.tail(15)
bar_positive_bigrams = px.bar(x=top_positive_bigrams[1].values, y=top_positive_bigrams[0].values, title='Positive Bigrams', orientation='h', color=top_positive_bigrams[0].values)
bar_positive_bigrams.show()

In [None]:
py.plot(bar_positive_bigrams, filename='Positive Bigrams',auto_open=False)

In [None]:
# negative_bigrams_df is sorted by ascending count, so its last 15 rows are the
# 15 most frequent bigrams. tail() replaces the hard-coded row positions, which
# only held for one exact size of the frame.
top_negative_bigrams = negative_bigrams_df.tail(15)
bar_negative_bigrams = px.bar(x=top_negative_bigrams[1].values, y=top_negative_bigrams[0].values, title='Negative Bigrams', orientation='h', color=top_negative_bigrams[0].values)
bar_negative_bigrams.show()

In [None]:
py.plot(bar_negative_bigrams, filename='Negative Bigrams',auto_open=False)

In [None]:
positive_trigrams_df

In [None]:
# positive_trigrams_df is sorted by ascending count, so its last 15 rows are the
# 15 most frequent trigrams. tail() replaces the hard-coded row positions; the
# bigram frame that was passed as data_frame by mistake is no longer involved.
top_positive_trigrams = positive_trigrams_df.tail(15)
bar_positive_trigrams = px.bar(x=top_positive_trigrams[1].values, y=top_positive_trigrams[0].values, title='Positive Trigrams', orientation='h', color=top_positive_trigrams[0].values)
bar_positive_trigrams.show()

In [None]:
py.plot(bar_positive_trigrams, filename='Positive Trigrams',auto_open=False)

In [None]:
negative_trigrams_df

In [None]:
# negative_trigrams_df is sorted by ascending count, so its last 15 rows are the
# 15 most frequent trigrams. tail() replaces the hard-coded row positions, which
# only held for one exact size of the frame.
top_negative_trigrams = negative_trigrams_df.tail(15)
bar_negative_trigrams = px.bar(x=top_negative_trigrams[1].values, y=top_negative_trigrams[0].values, title='Negative Trigrams', orientation='h', color=top_negative_trigrams[0].values)
bar_negative_trigrams.show()

In [None]:
py.plot(bar_negative_trigrams, filename='Negative Trigrams',auto_open=False)

In [None]:
# Plotting bigrams
TOP_BIGRAMS = 30

# The bigram frames are sorted by ascending count, so the most common entries sit
# at the end: take the tail and reverse it (most common first). Slicing [:30]
# plotted the 30 LEAST common bigrams under a "most common" title.
top_positive = positive_bigrams_df.tail(TOP_BIGRAMS).iloc[::-1]
top_negative = negative_bigrams_df.tail(TOP_BIGRAMS).iloc[::-1]

fig, axes = plt.subplots(ncols=2, figsize=(18, 50), dpi=100)

sns.barplot(y=top_positive[0].values, x=top_positive[1].values, ax=axes[0], color='turquoise')
sns.barplot(y=top_negative[0].values, x=top_negative[1].values, ax=axes[1], color='orange')

for i in range(2):
    axes[i].spines['right'].set_visible(False)
    axes[i].set_xlabel('')
    axes[i].set_ylabel('')
    axes[i].tick_params(axis='x', labelsize=20)
    axes[i].tick_params(axis='y', labelsize=20)

axes[0].set_title(f'Top {TOP_BIGRAMS} most common bigrams in Positive Tweets', fontsize=15)
axes[1].set_title(f'Top {TOP_BIGRAMS} most common bigrams in Negative Tweets', fontsize=15)

# Called after drawing so the long tick labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
#Plotting trigrams
TOP_TRIGRAMS = 30

# The trigram frames are sorted by ascending count, so the most common entries sit
# at the end: take the tail and reverse it (most common first). Slicing [:30]
# plotted the 30 LEAST common trigrams under a "most common" title.
top_positive = positive_trigrams_df.tail(TOP_TRIGRAMS).iloc[::-1]
top_negative = negative_trigrams_df.tail(TOP_TRIGRAMS).iloc[::-1]

fig, axes = plt.subplots(ncols=2, figsize=(18, 50), dpi=100)

sns.barplot(y=top_positive[0].values, x=top_positive[1].values, ax=axes[0], color='turquoise')
sns.barplot(y=top_negative[0].values, x=top_negative[1].values, ax=axes[1], color='orange')

for i in range(2):
    axes[i].spines['right'].set_visible(False)
    axes[i].set_xlabel('')
    axes[i].set_ylabel('')
    axes[i].tick_params(axis='x', labelsize=20)
    axes[i].tick_params(axis='y', labelsize=20)

axes[0].set_title(f'Top {TOP_TRIGRAMS} most common trigrams in Positive Tweets', fontsize=15)
axes[1].set_title(f'Top {TOP_TRIGRAMS} most common trigrams in Negative Tweets', fontsize=15)

# Called after drawing so the long tick labels are taken into account.
plt.tight_layout()
plt.show()

In [None]:
raw_text = [word for word_list in train_data['temp_list'] for word in word_list]

In [None]:
raw_text

In [None]:
train_data.head()

In [None]:
def words_unique(sentiment,numwords,raw_words):
    '''
    Count the words that occur only in tweets of one sentiment class.

    Input:
        sentiment - sentiment label to analyse, compared against train_data.sentiment (ex. 2);
        numwords - how many of the most frequent unique words to return;
        raw_words - iterable of candidate words (typically every word of the corpus);
                    only words contained in it can appear in the result.
    Output:
        dataframe with the columns 'words' and 'count' listing the words that appear in tweets
        of the chosen sentiment and in no tweet of any other sentiment, in descending order of count.

    Reads the module-level train_data frame (columns 'sentiment' and 'temp_list').
    '''
    in_class = train_data.sentiment == sentiment
    out_of_class = train_data.sentiment != sentiment

    # Vocabulary of all other sentiment classes; a set keeps the membership tests below O(1)
    # (the list-based version was quadratic in the corpus size).
    other_vocab = {word for tokens in train_data[out_of_class]['temp_list'] for word in tokens}

    # Candidate words that never occur outside the chosen sentiment.
    exclusive = set(raw_words) - other_vocab

    # Count occurrences inside the chosen sentiment, keeping only the exclusive words.
    counts = Counter(
        word
        for tokens in train_data[in_class]['temp_list']
        for word in tokens
        if word in exclusive
    )

    return pd.DataFrame(counts.most_common(numwords), columns = ['words','count'])

In [None]:
# Words found only in tweets with sentiment label 2 (treated as positive here):
# keep the 100 most frequent and render the first 20 with a green gradient.
Unique_Positive= words_unique(2, 100, raw_text)
print("The top 20 unique words in Positive Tweets are:")
Unique_Positive.head(20).style.background_gradient(cmap='Greens')

In [None]:
from palettable.colorbrewer.qualitative import Pastel1_7
# Pie chart of the counts of the 20 most frequent positive-only words
top_20_positive = Unique_Positive.head(20)
plt.figure(figsize=(16,10))
# White disc added on top of the pie centre, which turns the pie into a donut
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(top_20_positive['count'], labels=top_20_positive.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Plot Of Unique Positive Words')
plt.show()

In [None]:
# Words found only in tweets with sentiment label 0 (treated as negative here):
# keep the 100 most frequent and render the first 20 with a red gradient.
Unique_Negative = words_unique(0, 100, raw_text)
print("The top 20 unique words in Negative Tweets are:")
Unique_Negative.head(20).style.background_gradient(cmap='Reds')

In [None]:
from palettable.colorbrewer.qualitative import Pastel1_7
# Pie chart of the counts of the 20 most frequent negative-only words
top_20_negative = Unique_Negative.head(20)
plt.figure(figsize=(16,10))
# White disc added on top of the pie centre, which turns the pie into a donut
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(top_20_negative['count'], labels=top_20_negative.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('Plot Of Unique Negative Words')
plt.show()

In [None]:
# Words found only in tweets with sentiment label 1 (treated as neutral here):
# keep the 50 most frequent and render the first 10 with an orange gradient.
Unique_Neutral= words_unique(1, 50, raw_text)
print("The top 10 unique words in Neutral Tweets are:")
Unique_Neutral.head(10).style.background_gradient(cmap='Oranges')

In [None]:
# Pastel1_7 is not imported in this cell; it has to be in scope already
# (the donut cells above import it from palettable.colorbrewer.qualitative).
top_10_neutral = Unique_Neutral.head(10)
plt.figure(figsize=(16,10))
# White disc added on top of the pie centre, which turns the pie into a donut
my_circle=plt.Circle((0,0), 0.7, color='white')
plt.pie(top_10_neutral['count'], labels=top_10_neutral.words, colors=Pastel1_7.hex_colors)
p=plt.gcf()
p.gca().add_artist(my_circle)
plt.title('DoNut Plot Of Unique Neutral Words')
plt.show()

In [None]:
def plot_wordcloud(text, mask, max_words=200, max_font_size=100, figure_size=(15,10), color = 'white',
                   title = None, title_size=40, image_color=False):
    '''
    Draw a word cloud of `text` on a new matplotlib figure.

    Input:
        text - content to visualise; it is converted with str() before generation;
        mask - array handed to WordCloud as `mask` and, when image_color is true, to ImageColorGenerator;
        max_words, max_font_size - forwarded to WordCloud;
        figure_size - size of the matplotlib figure;
        color - background colour of the cloud;
        title, title_size - figure title and its font size;
        image_color - when true, the cloud is recoloured from the mask and drawn with bilinear interpolation.
    Output:
        None; the cloud is drawn on the current figure with the axes hidden.
    '''
    # Library stop words plus tokens excluded specifically for this data set
    extra_stopwords = {"bitlyhmiiga",'scheen','spos', 'needing','filteraa', 'lanzara', 'ningun', 'producto','cst', 'youaare', 'zlf', 'sat','aaps','offersaa' }
    excluded_words = set(STOPWORDS).union(extra_stopwords)

    cloud = WordCloud(
        background_color=color,
        stopwords=excluded_words,
        max_words=max_words,
        max_font_size=max_font_size,
        random_state=42,
        width=400,
        height=200,
        mask=mask,
    )
    cloud.generate(str(text))

    plt.figure(figsize=figure_size)
    if image_color:
        mask_colours = ImageColorGenerator(mask)
        plt.imshow(cloud.recolor(color_func=mask_colours), interpolation="bilinear")
        title_font = {'size': title_size,
                      'verticalalignment': 'bottom'}
    else:
        plt.imshow(cloud)
        # only the plain variant forces a black title
        title_font = {'size': title_size, 'color': 'black',
                      'verticalalignment': 'bottom'}
    plt.title(title, fontdict=title_font)
    plt.axis('off')
    plt.tight_layout()

In [None]:
# Flatten the per-tweet hashtag lists and join all hashtags into one space-separated string
flat_list = [item for sublist in positive_hashtags for item in sublist]
positive_hash_corpus = ' '.join(flat_list)

In [None]:
positive_hash_corpus

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the positive-hashtag word cloud
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(positive_hash_corpus,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Positive Hashtags")

In [None]:
# Flatten the per-tweet hashtag lists and join all hashtags into one space-separated string
flat_list = [item for sublist in negative_hashtags for item in sublist]
negative_hash_corpus = ' '.join(flat_list)

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the negative-hashtag word cloud
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(negative_hash_corpus,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Negative Hashtags")

In [None]:
# Flatten the per-tweet hashtag lists and join all hashtags into one space-separated string
flat_list = [item for sublist in neutral_hashtags for item in sublist]
neutral_hash_corpus = ' '.join(flat_list)

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the neutral-hashtag word cloud
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(neutral_hash_corpus,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Neutral Hashtags")

In [None]:
# Corpus of the most used words: every entry of temp.Common_words followed by a space
common_words = "".join("".join(entry) + " " for entry in temp.Common_words)
common_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of the most used words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(common_words,mask=pos_mask,color='white',max_font_size=200,title_size=30,title="WordCloud of Most Used Words")

In [None]:
# Corpus of the most used positive words: every entry of temp_positive.Common_words followed by a space
common_pos_words = "".join("".join(entry) + " " for entry in temp_positive.Common_words)
common_pos_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of the most used positive words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(common_pos_words,mask=pos_mask,color='white',max_font_size=200,title_size=30,title="WordCloud of Most Used Positive Words")

In [None]:
# Corpus of the words unique to positive tweets: every entry of Unique_Positive.words followed by a space
positive_words = "".join("".join(entry) + " " for entry in Unique_Positive.words)
positive_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of positive-only words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(positive_words,mask=pos_mask,color='white',max_font_size=200,title_size=30,title="WordCloud of Unique Positive Words")

In [None]:
# Corpus of the most used negative words: every entry of temp_negative.Common_words followed by a space
common_neg_words = "".join("".join(entry) + " " for entry in temp_negative.Common_words)
common_neg_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of the most used negative words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(common_neg_words,mask=pos_mask,color='white',max_font_size=200,title_size=30,title="WordCloud of Most Used Negative Words")

In [None]:
# Corpus of the words unique to negative tweets: every entry of Unique_Negative.words followed by a space
negative_words = "".join("".join(entry) + " " for entry in Unique_Negative.words)
negative_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of negative-only words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(negative_words,mask=pos_mask,color='white',max_font_size=200,title_size=30,title="WordCloud of Unique Negative Words")

In [None]:
# Corpus of the most used neutral words: every entry of temp_neutral.Common_words followed by a space
common_neu_words = "".join("".join(entry) + " " for entry in temp_neutral.Common_words)
common_neu_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of the most used neutral words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(common_neu_words,mask=pos_mask,color='white',max_font_size=200,title_size=30,title="WordCloud of Most Used Neutral Words")

In [None]:
# Corpus of the words unique to neutral tweets: every entry of Unique_Neutral.words followed by a space
neutral_words = "".join("".join(entry) + " " for entry in Unique_Neutral.words)
neutral_words

In [None]:
# Load twitter_mask.png as an array and use it as the mask for the word cloud of neutral-only words
d= '../data/masks-for-wordclouds/'
pos_mask = np.array(Image.open(d+ 'twitter_mask.png'))
plot_wordcloud(neutral_words,mask=pos_mask,color='white',max_font_size=100,title_size=30,title="WordCloud of Unique Neutral Words")

In [None]:
# Drop the row(s) whose tweet_id equals 5025.
# NOTE(review): the reason for excluding this tweet is not recorded here — worth documenting.
train_data = train_data[train_data['tweet_id'] != 5025]

In [None]:
train_data.shape

In [None]:
from gensim.utils import simple_preprocess

# New column 'tokenized_text': each tweet run through gensim's simple_preprocess with deacc=True
train_data['tokenized_text'] = train_data['tweet'].map(
    lambda line: simple_preprocess(line, deacc=True)
)
print(train_data['tokenized_text'].head(10))

In [None]:
# Same tokenisation for the test tweets
test_data['tokenized_text'] = test_data['tweet'].map(
    lambda line: simple_preprocess(line, deacc=True)
)
print(test_data['tokenized_text'].head(10))

In [None]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Get the stemmed_tokens: stem every token of every tweet with gensim's Porter stemmer
train_data['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in train_data['tokenized_text'] ]
test_data['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in test_data['tokenized_text']]

# Only the last expression of a cell is rendered automatically, so both
# previews are displayed explicitly (the train preview used to be discarded).
display(train_data['stemmed_tokens'].head(10))
display(test_data['stemmed_tokens'].head(10))

In [None]:
# import packages
from nltk.corpus import stopwords
import nltk
from string import punctuation
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize

# NOTE(review): the wildcard import above presumably rebinds PorterStemmer to the
# NLTK implementation (a gensim PorterStemmer is imported in an earlier cell) — confirm.
porter = PorterStemmer()


def _stem_and_join(tokens):
    """Stem every token and join the stems into one space-separated string.

    A value that is already a string is returned unchanged, so running this
    cell a second time does not split the joined text into single characters.
    """
    if isinstance(tokens, str):
        return tokens
    return ' '.join(porter.stem(token) for token in tokens)


# stemming words, then collapsing each tweet back into a single string
train_data['tokenized_text'] = train_data['tokenized_text'].apply(_stem_and_join)

# corpus: one processed string per training tweet
corpus = train_data['tokenized_text'].tolist()

# show a small sample instead of printing the entire corpus
print(corpus[:5])

In [None]:
# Stem the test tokens with `porter` and collapse each tweet into one space-separated string
test_data['tokenized_text'] = test_data['tokenized_text'].apply(
    lambda tokens: ' '.join([porter.stem(token) for token in tokens])
)

# train_data.head(20)

In [None]:
train_data.shape

In [None]:
# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer(max_features=1500)


# # Independent variable
# X = (cv.fit_transform(corpus)).toarray()


# # dependent variable
# y = train_data['sentiment']

# # Counts
# count = y.value_counts()
# print(count)

# # Split the dataset
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
train_data.isnull().sum()

In [None]:
train_data.to_csv('../data/traindata_check.csv',index=False)

In [None]:
traindata_check = pd.read_csv('../data/traindata_check.csv')

In [None]:
traindata_check.isnull().sum()

In [None]:
train_data.shape

In [None]:
from textblob import TextBlob

# Creating Polarity Column using TextBlob.
# The text comes from 'tokenized_text' (the stemmed, space-joined strings built in the
# cells above). The previously used 'tokenized_data' column is only created by
# commented-out code in this part of the notebook, so a fresh run would hit a KeyError.
# The intermediate TextBlob is no longer stored in `temp`, because that name is the
# frame an earlier cell reads via temp.Common_words.
tb_polarity = [TextBlob(sentence).sentiment[0] for sentence in train_data['tokenized_text']]
train_data['polarity'] = tb_polarity

test_polarity = [TextBlob(sentence).sentiment[0] for sentence in test_data['tokenized_text']]
test_data['polarity'] = test_polarity

In [None]:
train_data['polarity'].isnull().sum()

In [None]:
test_data.head()

In [None]:
test_data.shape

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training text and transform it in one step;
# the fitted vectorizer is reused for the test text in the next cell.
tfidf_vectorizer = TfidfVectorizer()
tweets_train = tfidf_vectorizer.fit_transform(train_data['tokenized_text'])

In [None]:
tweets_test = tfidf_vectorizer.transform(test_data['tokenized_text'])

In [None]:
X=tweets_train.toarray()
# X = pd.DataFrame(X)

# X.index = train_data.index

# X['polarity'] = train_data['polarity']
# train_data['sentiment']

In [None]:
# import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



# row count of the first 75% of train_data; only referenced by the commented-out positional split below
ratio = int(len(train_data)*0.75)

# logistic regression model
logreg = LogisticRegression(random_state=2) 

# Earlier experiment, kept for reference:

# TF-IDF feature matrix
# tfidf_vedtorizer = TfidfVectorizer(max_df=0.90,min_df=2,max_features=1000,stop_words='english')

# # fit and transform tweets
# tweets = tfidf_vedtorizer.fit_transform(train_data['tokenized_data'])


# random 75/25 split of the TF-IDF matrix X and the sentiment labels
X_train,X_test,y_train,y_test = train_test_split(X,train_data.sentiment, test_size=0.25,random_state=22)
# X_train = tweets[:ratio,:]
# X_test = tweets[ratio:,:]
# y_train = train_data['sentiment'].iloc[:ratio]
# y_test = train_data['sentiment'].iloc[ratio:]

# fit on training data
logreg.fit(X_train,y_train)

# make predictions
prediction = logreg.predict(X_test)
# prediction_int = (prediction[:,1] >= 0.3).astype(int)

# weighted F1 score on the held-out split (shown as the cell output)
f1 = f1_score(y_test,prediction,average='weighted')
f1

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score

# Instantiate classifier
rf = RandomForestClassifier(random_state=2)

# fit model on training data
rf.fit(X_train,y_train)

# predict on the held-out split
y_pred = rf.predict(X_test)

# weighted F1 score (not accuracy) of the predictions
score = f1_score(y_test,y_pred,average='weighted')

# precision is currently disabled
# precision = precision_score(y_test,y_pred)

# display 'score'

print(score)
# print(precision


In [None]:
# NOTE(review): the only visible definition of gridF is in the GridSearchCV cell further
# down this notebook, so on a fresh top-to-bottom run this cell raises a NameError
# unless that cell is executed first.
y_pred_rf_grid = gridF.predict(X_test)
score_grid_rf = f1_score(y_test,y_pred_rf_grid,average='weighted')


In [None]:
score_grid_rf

In [None]:
# import packages
from imblearn.over_sampling import SMOTE

# Instantiate SMOTE with a fixed seed
smote = SMOTE(random_state=9)

# Oversample the training split only; X_test / y_test are left untouched for evaluation.
# fit_resample is the current name of this method: fit_sample was removed from
# imbalanced-learn, so the old call fails on recent releases.
X_smote,y_smote = smote.fit_resample(X_train,y_train)

# refit the random forest on the resampled training data
rf.fit(X_smote,y_smote)

# predict on the held-out split
y_pred = rf.predict(X_test)

# weighted F1 score of the predictions
score = f1_score(y_test,y_pred,average='weighted')

print(score)

In [None]:
# Imported here because this cell runs before the cell that imports GridSearchCV.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest
n_estimators = [100, 300, 500, 800, 1200]
max_depth = [5, 8, 15, 25, 30]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

hyperF = dict(n_estimators = n_estimators, max_depth = max_depth,
              min_samples_split = min_samples_split,
              min_samples_leaf = min_samples_leaf)

# Exhaustive 3-fold search on the SMOTE-resampled training data, using all cores
gridF = GridSearchCV(rf, hyperF, cv = 3, verbose = 1,
                      n_jobs = -1)
gridF.fit(X_smote, y_smote)
print(gridF.best_params_)
print(gridF.best_estimator_)

rf_grid_predict = gridF.predict(X_test)
# The sentiment target has more than two classes (labels 0, 1 and 2 are used in this
# notebook), so f1_score needs an explicit average; 'weighted' matches the other models.
rf_grid_score = f1_score(y_test,rf_grid_predict,average='weighted')
print(rf_grid_score)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths for an RBF-kernel SVC.
# The list used to contain 0.1 twice, which only repeated identical fits.
# NOTE(review): if a third distinct value (e.g. 0.01 or 10) was intended, add it here.
param_grid = {'C': [0.1, 1],
              'kernel': ['rbf']}

grid = GridSearchCV(
    SVC(), param_grid, refit = True, verbose = 3)

# fitting the model for grid search on the SMOTE-resampled training data
grid.fit(X_smote, y_smote)

print(grid.best_params_)

# print how our model looks after hyper-parameter tuning
print(grid.best_estimator_)

grid_predictions = grid.predict(X_test)

# weighted F1 score of the tuned model on the held-out split
print(f1_score(y_test, grid_predictions,average='weighted'))


In [None]:
# RBF-kernel SVC with fixed C and gamma, trained on the original (non-resampled) training split;
# prints the weighted F1 score on the held-out split.
svc_model = SVC(C=1, gamma = 1, kernel = 'rbf')
svc_model.fit(X_train,y_train)
score_svc  = f1_score(y_test,svc_model.predict(X_test),average='weighted')
print(score_svc)

In [None]:
# Same SVC configuration, trained on the SMOTE-resampled training data.
# Note that score_svc is reused, so the score of the non-resampled model is overwritten.
svc_model_smote = SVC(C=1, gamma = 1, kernel = 'rbf')
svc_model_smote.fit(X_smote,y_smote)
score_svc  = f1_score(y_test,svc_model_smote.predict(X_test),average='weighted')
print(score_svc)

### Test Data Modelling 

In [None]:
# From here on y_train holds the labels of the whole training frame; it replaces the
# y_train produced by train_test_split, so re-running the evaluation cells above
# without re-running the split would use mismatched data.
y_train = train_data['sentiment']

In [None]:
# Despite its name, `y` is the dense TF-IDF feature matrix of the test tweets, not a label vector.
y = tweets_test.toarray()

In [None]:
# y= pd.DataFrame(y)

In [None]:
ID = test_data['tweet_id']

In [None]:
# y.index.equals(test_data.index)
# y.index.intersection(test_data.index).empty 

In [None]:
# y['polarity'] = test_data['polarity']

In [None]:
smote = SMOTE(random_state=9)
# Oversample the full training matrix. fit_resample is the current name of this
# method: fit_sample was removed from imbalanced-learn.
X_smote,y_smote = smote.fit_resample(X,y_train)
svc_model = SVC(C=1, gamma = 1, kernel = 'rbf')
svc_model.fit(X_smote,y_smote)
# `y` is the test-set TF-IDF feature matrix in this notebook, not a label vector
y_pred_svc = svc_model.predict(y)

In [None]:
y_pred_svc

In [None]:
smote = SMOTE(random_state=9)

# Oversample the full training matrix. fit_resample is the current name of this
# method: fit_sample was removed from imbalanced-learn.
X_smote,y_smote = smote.fit_resample(X,y_train)

# Instantiate a fresh classifier for the final model
rf_test = RandomForestClassifier(random_state=2)

# Fit the new classifier on the resampled data (the cell used to create rf_test
# but then fit and query the older `rf` object instead).
rf_test.fit(X_smote,y_smote)

# predict on test data; `y` is the test-set TF-IDF feature matrix in this notebook
# (this comment used to be split over two lines, leaving a bare line that was a syntax error)
y_pred_rf = rf_test.predict(y)


In [None]:
y_pred_rf

In [None]:
y_pred_rf.shape

In [None]:
prediction = pd.DataFrame(y_pred_svc,columns=['sentiment'])

In [None]:
# Pair every test tweet_id with its predicted sentiment and write the submission file.
# axis is passed by keyword because recent pandas releases no longer accept it positionally.
# The index of ID is reset so rows are matched by position with the freshly built
# prediction frame even if test_data does not carry a default 0..n-1 index.
submission_trial_10 = pd.concat([ID.reset_index(drop=True),prediction['sentiment']],axis=1)
submission_trial_10.to_csv('../data/submission_trial_10.csv',index=False)

In [None]:
submission_file = pd.read_csv('../data/submission_trial_10.csv')
submission_file.head()