In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so that
# later cells can reach the project files through ordinary file paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Project folder on the mounted Drive. The trailing slash lets later cells
# append a relative file name by plain string concatenation.
root_path = "gdrive/My Drive/Twitter_sentiment/"

In [None]:
# Standard library
import re

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK corpora that the stop-word and lemmatization cells rely on.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

print("Tensorflow Version", tf.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Tensorflow Version 2.4.1


In [None]:
# List the input folder on the mounted Drive to confirm the raw CSV is present.
!ls 'gdrive/My Drive/Twitter_sentiment/input'

training.1600000.processed.noemoticon.csv


In [None]:
# Column labels assigned to the raw CSV when it is loaded, in file order.
col_names = [
    'target',
    'ids',
    'date',
    'flag',
    'user',
    'tweet',
]

In [None]:
# Load the raw tweets. The file is read without a header row (header=None),
# so the column labels are supplied explicitly through col_names.
raw_csv_path = 'gdrive/My Drive/Twitter_sentiment/input/training.1600000.processed.noemoticon.csv'
df = pd.read_csv(
    raw_csv_path,
    names=col_names,
    header=None,
    encoding='latin',
)

In [None]:
# Preview the first rows of the raw frame to check that the column labels line up.
df.head()

Unnamed: 0,target,ids,date,flag,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Keep only the label and the tweet text by dropping the metadata columns.
metadata_columns = ["ids", "date", "flag", "user"]
df = df.drop(columns=metadata_columns)

# Recode the label value 4 as 1; every other label value is left as it is.
df['target'] = df['target'].replace({4: 1})

In [None]:
## Data Cleaning
# Every pattern in this cell is a regular expression, so regex=True is passed
# explicitly. pandas 2.0 changed the default of Series.str.replace to literal
# matching; without the flag these calls would look for the pattern text itself
# and silently leave the tweets unchanged.

# Lower Case
df["tweet"] = df["tweet"].str.lower()

# Removing Emails
df["tweet"] = df["tweet"].str.replace(r'(\w+\.)*\w+@(\w+\.)+[a-z]+', '', regex=True)

# Removing URL's
df["tweet"] = df["tweet"].str.replace(r'(http|ftp|https)://[-\w.]+(:\d+)?(/([\w/_.]*)?)?|www[\.]\S+', '', regex=True)

# Removing @mentions and #hashtags (the marker plus everything up to the next whitespace)
df["tweet"] = df["tweet"].str.replace(r'[\@\#]\S+', '', regex=True)

In [None]:
# Preview the frame after lower-casing and removing e-mails, URLs, mentions and hashtags.
df.head()

Unnamed: 0,target,tweet
0,0,"- awww, that's a bummer. you shoulda got da..."
1,0,is upset that he can't update his facebook by ...
2,0,i dived many times for the ball. managed to s...
3,0,my whole body feels itchy and like its on fire
4,0,"no, it's not behaving at all. i'm mad. why am..."


In [None]:

# Converting Emoticons
# Keys are lower-case because the tweets were lower-cased in an earlier cell.
emo_info = {
    # positive emoticons
    ":‑)": " good ",   # original key: its hyphen is a non-ASCII look-alike
    ":-)": " good ",   # ASCII-hyphen spelling, the one people actually type
    ":)": " good ",
    ";)": " good ",
    ":-}": " good ",
    "=]": " good ",
    "=)": " good ",
    ";d": " good ",
    ":d": " good ",
    ":dd": " good ",
    "xd": " good ",
    ":p": " good ",
    "xp": " good ",
    "<3": " love ",

    # negative emoticons
    ":‑(": " sad ",    # original key (non-ASCII hyphen look-alike)
    ":-(": " sad ",    # ASCII-hyphen spelling
    ":‑[": " sad ",    # original key (non-ASCII hyphen look-alike)
    ":-[": " sad ",    # ASCII-hyphen spelling
    ":(": " sad ",
    "=(": " sad ",
    "=/": " sad ",
    ":{": " sad ",
    ":/": " sad ",
    ":|": " sad ",
    ":-/": " sad ",
    ":o": " shock "
}

# Longest keys first so that a short emoticon never consumes part of a longer one.
emo_info_order = sorted(emo_info, key=lambda k: (len(k), k), reverse=True)


def _emo_pattern(key):
    """Compile the regex used to find one emoticon key.

    Emoticons that contain letters are guarded so they cannot fire inside an
    ordinary word (plain substring replacement turned "experience" into
    "e good erience" via the "xp" key):

    * a key that starts with a letter or digit must not be preceded by a word
      character;
    * a key that ends with a letter may repeat that letter (":ddd") and must
      not be followed by another word character (":please" is left alone).
    """
    pattern = re.escape(key)
    if key[0].isalnum():
        pattern = r"(?<!\w)" + pattern
    if key[-1].isalpha():
        pattern += re.escape(key[-1]) + r"*(?!\w)"
    return re.compile(pattern)


# (compiled pattern, replacement text) pairs in application order.
_emo_rules = [(_emo_pattern(key), emo_info[key]) for key in emo_info_order]


def emo_repl(phrase):
    """Replace every known emoticon in ``phrase`` with its sentiment word.

    Args:
        phrase: A lower-cased tweet.

    Returns:
        The text with each emoticon from ``emo_info`` replaced by the mapped
        word (padded with spaces, exactly as stored in ``emo_info``).
    """
    for pattern, replacement in _emo_rules:
        # A callable replacement inserts the text literally, with no
        # backslash-escape processing by the regex engine.
        phrase = pattern.sub(lambda _match, text=replacement: text, phrase)
    return phrase

# Run the emoticon conversion over every tweet, one string at a time.
df['tweet'] = df['tweet'].map(emo_repl)


In [None]:
# Expanding Contractions

# (pattern, replacement) pairs applied from top to bottom. The order matters:
# the specific forms have to be rewritten before the generic suffix rules,
# otherwise "won't" would become "wo not".
_CONTRACTION_RULES = (
    # specific
    (r"won\'t", "will not"),
    (r"can\'t", "can not"),
    (r"\bdon't\b", "do not"),
    (r"\bdoesn't\b", "does not"),
    (r"\bdidn't\b", "did not"),
    (r"\bdidnt\b", "did not"),
    (r"\bhasn't\b", "has not"),
    (r"\bhaven't\b", "have not"),
    (r"\bhavent\b", "have not"),
    (r"\bhadn't\b", "had not"),
    (r"\bwon't\b", "will not"),
    (r"\bwouldn't\b", "would not"),
    # general
    (r"n\'t", " not"),
    (r"\'re", " are"),
    (r"\'s", " is"),
    (r"\'d", " would"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'m", " am"),
)


def decontracted(phrase):
    """Expand English contractions in ``phrase``.

    Args:
        phrase: Text to process; the patterns are lower-case only.

    Returns:
        The text after every rule in ``_CONTRACTION_RULES`` has been applied
        in sequence. Note that ``'s`` is always expanded to " is", so a
        possessive such as "john's" becomes "john is".
    """
    expanded = phrase
    for pattern, replacement in _CONTRACTION_RULES:
        expanded = re.sub(pattern, replacement, expanded)
    return expanded

# Expand contractions in every tweet, one string at a time.
df['tweet'] = df['tweet'].map(decontracted)

In [None]:

# Removing Stop Words
# Tokens are whitespace-separated and still carry punctuation at this stage,
# so a token such as "it." is not matched by the stop word "it".

stop = stopwords.words('english')
manual_sw_list = ['retweet', 'retwet', 'rt', 'oh', 'dm', 'mt', 'ht', 'ff', 'shoulda', 'woulda', 'coulda', 'might', 'im', 'tb', 'mysql', 'hah', "a", "an", "the", "and", "but", "if",
                  "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over",
                  "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "nor", "only", "own", "same", "so", "than", "too", "very", "s",
                  "t", "just", "don", "now", 'tweet', 'x', 'f']

stop.extend(manual_sw_list)

# A membership test against a list scans the whole list; a set answers in
# constant time, which matters when every word of every tweet is checked.
# `stop` itself stays a list so any other code using it is unaffected.
stop_set = set(stop)

df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_set]))


In [None]:
# Preview the tweets after emoticon conversion, contraction expansion and stop-word removal.
df.head()

Unnamed: 0,target,tweet
0,0,"- awww, bummer. got david carr third day it. good"
1,0,upset update facebook texting it... cry result...
2,0,dived many times ball. managed save 50% rest g...
3,0,whole body feels itchy like fire
4,0,"no, behaving all. mad. here? see there."


In [None]:
# Applying Lemmatization
# Each whitespace-separated token is lemmatized as a verb ('v').

lem = WordNetLemmatizer()
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([lem.lemmatize(word, 'v') for word in x.split()]))

# NOTE: every pattern replacement below passes regex=True explicitly. pandas 2.0
# changed the default of Series.str.replace to literal matching, which would
# make these calls silent no-ops.

# Removing extra punctuations

df["tweet"] = df["tweet"].str.replace(r'[^\w\s]', '', regex=True)

# Removing Digits

df["tweet"] = df["tweet"].str.replace(r'[0-9]+', '', regex=True)

# Removing Non-Alphabet
# Any token that still contains a character outside a-z is dropped entirely.

non_alphabet = re.compile(r'[^a-z]+')
df['tweet'] = df['tweet'].apply(lambda x: ' '.join([word for word in x.split() if non_alphabet.search(word) is None]))

# Removing Duplicate Letters
# Step 1: shrink every run of a repeated letter to exactly two letters.
# Step 2: if the result has no WordNet synset, shrink the runs further to a
#         single letter; words WordNet knows keep their double letters.

df['tweet'] = df['tweet'].str.replace(r'([a-z])\1{1,}', r'\1\1', regex=True)

# WordNet lookups are comparatively slow and the same words recur across
# tweets, so the yes/no answer is remembered per distinct word.
_has_synset = {}


def _squeeze_if_unknown(word):
    """Return ``word`` unchanged if WordNet knows it, else with letter runs collapsed to one."""
    if word not in _has_synset:
        _has_synset[word] = len(wordnet.synsets(word)) > 0
    if _has_synset[word]:
        return word
    return re.sub(r'([a-z])\1{1,}', r'\1', word)


df['tweet'] = df['tweet'].apply(lambda x: ' '.join([_squeeze_if_unknown(word) for word in x.split()]))

# Cutting Duplicate Laughing Sound

df['tweet'] = df['tweet'].str.replace(r'(ha)\1{1,}', r'\1', regex=True)

# Remove Empty Rows
# Keep only tweets that still have text, then renumber the index from 0.

df = df[df["tweet"] != ''].reset_index(drop=True)



In [None]:
# Preview the fully cleaned tweets before they are written to disk.
df.head()

Unnamed: 0,target,tweet
0,0,aw bummer get david car third day it good
1,0,upset update facebok texting it cry result sch...
2,0,dive many time ball manage save rest go bound
3,0,whole body feel itchy like fire
4,0,no behave all mad here see there


In [None]:
# Save the cleaned frame into the input/ folder. The separator must be a
# forward slash: on Colab's Linux filesystem a backslash is an ordinary
# filename character, so the previous 'input\clean_input.csv' produced a file
# with that literal name directly under root_path instead of the
# input/clean_input.csv that is read back afterwards.
# The index is written too (to_csv default); it comes back as the
# 'Unnamed: 0' column when the file is reloaded.
df.to_csv(root_path + 'input/clean_input.csv')

In [None]:
# Class balance: number of tweets per label after cleaning.
df['target'].value_counts()

0    796936
1    796317
Name: target, dtype: int64

In [None]:
# Reload the cleaned tweets from Drive, using the same root_path as the rest
# of the notebook.
# keep_default_na=False stops pandas from converting a tweet whose entire
# cleaned text is a token like "nan" or "null" into a missing value (NaN),
# which would otherwise put float NaNs into the text column.
cleandf = pd.read_csv(root_path + 'input/clean_input.csv', keep_default_na=False)

In [None]:
# Preview the reloaded frame; the saved index shows up as an extra leading column.
cleandf.head()

Unnamed: 0.1,Unnamed: 0,target,tweet
0,0,0,aw bummer get david car third day it good
1,1,0,upset update facebok texting it cry result sch...
2,2,0,dive many time ball manage save rest go bound
3,3,0,whole body feel itchy like fire
4,4,0,no behave all mad here see there


In [None]:
# Show the column labels of the reloaded frame.
cleandf.columns

Index(['Unnamed: 0', 'target', 'tweet'], dtype='object')

In [None]:
# Remove the column that holds the index written by to_csv; the frame is
# modified in place.
cleandf.drop(columns=['Unnamed: 0'], inplace=True)

In [None]:
# Preview the reloaded frame after the saved-index column has been dropped.
cleandf.head()

Unnamed: 0,target,tweet
0,0,aw bummer get david car third day it good
1,0,upset update facebok texting it cry result sch...
2,0,dive many time ball manage save rest go bound
3,0,whole body feel itchy like fire
4,0,no behave all mad here see there
