In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

In [2]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

In [3]:
df = pd.read_csv("twitter_data.csv",encoding='latin1',header=None)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
df = df[[5,0]]

In [6]:
df.columns = ['tweets','sentiment']

In [7]:
df.head()

Unnamed: 0,tweets,sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [8]:
df.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [9]:
df.sentiment.unique()

array([0, 4], dtype=int64)

## Word Count

In [10]:
df["word_counts"] = df['tweets'].apply(lambda x: len(str(x).split()))

In [11]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19
1,is upset that he can't update his Facebook by ...,0,21
2,@Kenichan I dived many times for the ball. Man...,0,18
3,my whole body feels itchy and like its on fire,0,10
4,"@nationwideclass no, it's not behaving at all....",0,21


In [12]:
df["char_counts"] = df['tweets'].apply(lambda x: len(x))
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115
1,is upset that he can't update his Facebook by ...,0,21,111
2,@Kenichan I dived many times for the ball. Man...,0,18,89
3,my whole body feels itchy and like its on fire,0,10,47
4,"@nationwideclass no, it's not behaving at all....",0,21,111


### Average Word Length

In [13]:
def get_avg_word_len(x):
    words = x.split()
    word_len = 0
    for word in words:
        word_len += len(word)
    return word_len/len(words)

In [14]:
df['avg_word_len'] = df['tweets'].apply(lambda x: get_avg_word_len(x))

In [15]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444
3,my whole body feels itchy and like its on fire,0,10,47,3.7
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714


### Count Stop Words

In [16]:
print(STOP_WORDS)

{'being', 'least', 'among', 'amongst', 'wherever', 'hence', 'therefore', 'made', 'except', 'such', 'between', 'i', 'below', 'sixty', 'its', 'rather', 'never', 'in', 'either', 'must', 'serious', 'ten', 'someone', 'twenty', 'toward', 'anything', 'meanwhile', 'still', 'nowhere', 'used', 'why', 'side', 'something', 'along', 'formerly', "'m", '‘s', 'almost', 'besides', 'herself', 'if', 'else', 'or', 'whither', 'out', 'latter', "'d", 'hereby', 'you', 'our', 'too', 'other', 'noone', 'should', 'itself', 'becoming', 'did', 'every', 'full', 'back', 'against', 'whence', 'they', 'say', 'ourselves', 'show', 'take', 'thus', '’ve', 'same', 'only', 'amount', 'my', 'seems', 'give', 'though', '’re', 'without', 'many', 'afterwards', 'top', 'she', 'yet', "'ve", 'yourself', 'where', 'some', 'make', 'much', 'see', 'while', 'your', 'eight', 'whereafter', 'everyone', 'move', 'sometimes', 'he', 'seem', 'been', 'be', 'whatever', 'various', 'even', 'done', 'there', 'everything', 'nobody', 'thereupon', 'via', 'el

In [17]:
df["stop_words_len"] = df.tweets.apply(lambda x: len( [t for t in x.split() if t in STOP_WORDS ]  ))

In [18]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10


### Count #HashTag And @Mentions

In [19]:
df["hashtag_count"] = df.tweets.apply(lambda x : len([t for t in x.split() if t.startswith("#")]))
df["mention_count"] = df.tweets.apply(lambda x : len([t for t in x.split() if t.startswith("@")]))

In [20]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4,0,1
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9,0,0
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1


### If Numeric digits are present in tweets

In [21]:
df["numeric_count"] = df["tweets"].apply(lambda x : len([t for t in x.split() if t.isdigit()]))

In [22]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count,numeric_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4,0,1,0
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9,0,0,0
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7,0,1,0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1,0


### Upper Case Word Count

In [23]:
df["UpperCase_count"] = df["tweets"].apply(lambda x : len([t for t in x.split() if t.isupper() and len(x)>3]))

In [24]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count,numeric_count,UpperCase_count
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,19,115,5.052632,4,0,1,0,1
1,is upset that he can't update his Facebook by ...,0,21,111,4.285714,9,0,0,0,0
2,@Kenichan I dived many times for the ball. Man...,0,18,89,3.944444,7,0,1,0,1
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0
4,"@nationwideclass no, it's not behaving at all....",0,21,111,4.285714,10,0,1,0,1


In [25]:
df.loc[96]['tweets']

"so rylee,grace...wana go steve's party or not?? SADLY SINCE ITS EASTER I WNT B ABLE 2 DO MUCH  BUT OHH WELL....."

### Preprocessing and cleaning

#### Lower Case Conversion

In [26]:
df['tweets'] = df.tweets.apply(lambda x: x.lower())

In [27]:
df.head(2)

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count,numeric_count,UpperCase_count
0,"@switchfoot http://twitpic.com/2y1zl - awww, t...",0,19,115,5.052632,4,0,1,0,1
1,is upset that he can't update his facebook by ...,0,21,111,4.285714,9,0,0,0,0


#### Contraction to expansion

In [28]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and "}

In [29]:
def contract(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key, value)
        return x
    else:
        return x

In [30]:
x = 'hi, i would be happy'
contract(x)

'hi, i would be happy'

In [31]:
df["tweets"] = df.tweets.apply(lambda x: contract(x))

### Remove URLs 

In [32]:
import re


In [33]:
df['urls_flag'] = df['tweets'].apply(lambda x: len(re.findall(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', x)))


In [34]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub(r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '', x))


In [35]:
df.head()


Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count,numeric_count,UpperCase_count,urls_flag
0,"@switchfoot - awww, that is a bummer. you sh...",0,19,115,5.052632,4,0,1,0,1,1
1,is upset that he cannot update his facebook by...,0,21,111,4.285714,9,0,0,0,0,0
2,@kenichan i dived many times for the ball. man...,0,18,89,3.944444,7,0,1,0,1,0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0,0
4,"@nationwideclass no, it is not behaving at all...",0,21,111,4.285714,10,0,1,0,1,0


### Removing Retweets

In [36]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub('RT', "", x))

### Removing Special Characters and Puncuations

In [37]:
df['tweets'] = df['tweets'].apply(lambda x: re.sub('[^A-Z a-z 0-9-]+', '', x))


In [38]:
df.head()


Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count,numeric_count,UpperCase_count,urls_flag
0,switchfoot - awww that is a bummer you shoul...,0,19,115,5.052632,4,0,1,0,1,1
1,is upset that he cannot update his facebook by...,0,21,111,4.285714,9,0,0,0,0,0
2,kenichan i dived many times for the ball manag...,0,18,89,3.944444,7,0,1,0,1,0
3,my whole body feels itchy and like its on fire,0,10,47,3.7,5,0,0,0,0,0
4,nationwideclass no it is not behaving at all i...,0,21,111,4.285714,10,0,1,0,1,0


### Removing Accented Characters

In [39]:
import unicodedata


In [40]:
def remove_accented_chars(x):
    x = unicodedata.normalize('NFKD', x).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return x

In [41]:
x = 'Áccěntěd těxt'
remove_accented_chars(x)

'Accented text'

### Removing Stop Words


In [42]:
df['tweets'] = df['tweets'].apply(lambda x: " ".join([t for t in x.split() if t not in STOP_WORDS]))


In [43]:
df.head()

Unnamed: 0,tweets,sentiment,word_counts,char_counts,avg_word_len,stop_words_len,hashtag_count,mention_count,numeric_count,UpperCase_count,urls_flag
0,switchfoot - awww bummer shoulda got david car...,0,19,115,5.052632,4,0,1,0,1,1
1,upset update facebook texting cry result schoo...,0,21,111,4.285714,9,0,0,0,0,0
2,kenichan dived times ball managed save 50 rest...,0,18,89,3.944444,7,0,1,0,1,0
3,body feels itchy like fire,0,10,47,3.7,5,0,0,0,0,0
4,nationwideclass behaving mad,0,21,111,4.285714,10,0,1,0,1,0


## Model Building

In [44]:
X = df["tweets"]
y = df["sentiment"]

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [48]:
tvec = TfidfVectorizer()
log = LogisticRegression()

In [49]:
#it executes all the steps one by one
from sklearn.pipeline import Pipeline

In [60]:
# this will first create a vectorizer and then create a model
model = Pipeline([('vectorizer',tvec),('classifier',log)])

In [61]:
model.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [52]:
from sklearn.metrics import confusion_matrix

In [53]:
predictions = model.predict(X_test)

In [54]:
confusion_matrix(predictions,y_test)

array([[120690,  32086],
       [ 38804, 128420]], dtype=int64)

### Model Predictions

In [55]:
from sklearn.metrics import accuracy_score,precision_score,recall_score

In [56]:
print("Accuracy : ",accuracy_score(predictions,y_test))
print("Precision : ",precision_score(predictions,y_test,average='weighted'))
print("Recall : ",recall_score(predictions,y_test,average='weighted'))

Accuracy :  0.77846875
Precision :  0.7793796503192831
Recall :  0.77846875


## Predicting



### 0 - Negative
### 4 - Positive

In [57]:
example = ["I hate you"]
model.predict(example)

array([0], dtype=int64)

In [66]:
model.predict(["So happy the Greatest Of All Time will meet again tonight It's gonna be a showdown Watch out  Ronaldo"])

array([4], dtype=int64)

In [62]:
model.predict(["I need to say this so people know how big of a mistake this was, I was traumatized by Human Centipede back in 2006 & got permanent thoughts of it & occasional nightmares, it should've NEVER been made and when I type Human, that movie always pops up. #horrormovies #hate #concerns"])

array([0], dtype=int64)

In [64]:
model.predict(["As cases of Covid-19 continue to rise across the country, a poll of firefighters in the Fire Department of New York City found that nearly 55% of respondents would not get a Covid-19 vaccine if offered by the department, their union president told CNN"])

array([4], dtype=int64)

In [65]:
model.predict(["way too much money invested by these pharmaceuticals than to create a faulty fatal vaccine that would be financial suicide. i'm so sick of this ''conspiracy malfeasance'' perpetuated by the king dictator in this country. i will take the vaccine the minute it's offered to me."])

array([4], dtype=int64)