In [1]:
# Import the required libraries.

import pandas as pd
import numpy as np
import nltk
import string
import re
nltk.download('stopwords')
nltk.download('punkt')
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [2]:
# Load the CSV as header-less (header=None) and label its four columns ourselves.
columns = ['TweetID', 'Entity', 'Output', 'Tweet']
data = pd.read_csv(
    r'../input/twitter-entity-sentiment-analysis/twitter_validation.csv',
    header=None,
    names=columns,
)

In [3]:
# Preview the first five rows of the freshly loaded frame.
data.head(n=5)

In [4]:
# WordNetLemmatizer needs the 'wordnet' corpus, but the setup cell only downloads
# 'stopwords' and 'punkt'; without it lem.lemmatize() raises LookupError.
nltk.download('wordnet')

# Regex matching any single character from string.punctuation (e.g. -, /, $).
a = re.compile('[%s]' % re.escape(string.punctuation))

# English stop words (a, the, is, ...) kept in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Stemmer and lemmatizer instances from NLTK, created once and reused for every tweet.
p_stem = PorterStemmer()
lem = WordNetLemmatizer()


def clean_tweet(text):
    """Normalise one raw tweet into a space-separated string of processed tokens.

    Steps, in order: tokenize, lower-case, strip punctuation characters, keep
    only purely alphabetic tokens, drop English stop words, Porter-stem, then
    WordNet-lemmatize.

    Parameters
    ----------
    text : object
        Raw cell value from the 'Tweet' column. Non-string values are passed
        through ``str()``; missing values (NaN/None) are treated as empty.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # A missing tweet must not become the literal word "nan" in the vocabulary.
    if pd.isna(text):
        return ''
    # Lower-case each token and strip punctuation characters from inside it.
    stripped = (a.sub('', w.lower()) for w in word_tokenize(str(text)))
    # isalpha() also discards tokens emptied by the punctuation strip and any
    # token that still contains digits or other non-letters.
    kept = [w for w in stripped if w.isalpha() and w not in stop_words]
    # Stem first, then lemmatize the stem (same order as before).
    return ' '.join(lem.lemmatize(p_stem.stem(w)) for w in kept)


# Cleaned text for every row, in row order. Iterating over the Series values
# (rather than data['Tweet'][i]) does not depend on the index being 0..n-1.
main = [clean_tweet(text) for text in tqdm(data['Tweet'])]
    
    

In [5]:
# Attach the cleaned text as 'processed_Tweet' and remove the raw 'Tweet' column.
# errors='ignore' keeps this cell re-runnable: on a second run 'Tweet' is already
# gone, and a plain drop would raise KeyError.
data = (
    data
    .assign(processed_Tweet=main)
    .drop(columns='Tweet', errors='ignore')
)

In [6]:
# Check the frame now that 'processed_Tweet' has taken the place of 'Tweet'.
data.head(n=5)

In [7]:
# Bag-of-words features: learn the vocabulary from the cleaned tweets and count
# how often each term occurs in each tweet.
# NOTE(review): the vocabulary is fit on all rows before the train/test split —
# confirm this is acceptable for the evaluation.
cnt = CountVectorizer()
bag_of_words = cnt.fit_transform(main)
# Expand the vectorizer's output into a dense array.
X = bag_of_words.toarray()

In [8]:
# Show the distinct label values found in the 'Output' column.
pd.unique(data['Output'])


In [9]:
# One-hot encode the labels: every distinct 'Output' value gets its own 0/1 column.
enc = OneHotEncoder(handle_unknown='ignore')
# Reshape the label column into a single-column 2-D array before encoding.
d = data['Output'].to_numpy().reshape(-1, 1)
# Fit the encoder on the labels and convert the encoded result to a dense array.
y = enc.fit_transform(d).toarray()

In [10]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

In [11]:
# Display the training portion of the dense count matrix X.
X_train

In [12]:
# Display the training portion of the one-hot encoded labels y.
y_train