# Text Classification

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import os
# print(os.listdir("../input"))
plt.style.use('ggplot')

In [2]:
filepath_dict = {'yelp':   '/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/yelp_labelled.txt',
                 'amazon': '/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/amazon_cells_labelled.txt',
                 'imdb':   '/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

In [3]:
df_list

[                                              sentence  label source
 0                             Wow... Loved this place.      1   yelp
 1                                   Crust is not good.      0   yelp
 2            Not tasty and the texture was just nasty.      0   yelp
 3    Stopped by during the late May bank holiday of...      1   yelp
 4    The selection on the menu was great and so wer...      1   yelp
 ..                                                 ...    ...    ...
 995  I think food should have flavor and texture an...      0   yelp
 996                           Appetite instantly gone.      0   yelp
 997  Overall I was not impressed and would not go b...      0   yelp
 998  The whole experience was underwhelming, and I ...      0   yelp
 999  Then, as if I hadn't wasted enough of my life ...      0   yelp
 
 [1000 rows x 3 columns],
                                               sentence  label  source
 0    So there is no way for me to plug it in here i...      

In [4]:
df = pd.concat(df_list)
df.iloc[0]

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object

In [5]:
df.head()

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [6]:
df.tail()

Unnamed: 0,sentence,label,source
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb
747,All in all its an insult to one's intelligence...,0,imdb


Now use the CountVectorizer provided by the scikit-learn library to vectorize sentences. It takes the words of each sentence and creates a vocabulary of all the unique words in the sentences. This vocabulary can then be used to create a feature vector of the count of the words:

In [7]:
sentences = ['Rashmi likes ice cream', 'Rashmi hates chocolate.']

In [8]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'Rashmi': 0, 'chocolate': 1, 'cream': 2, 'hates': 3, 'ice': 4, 'likes': 5}

In [9]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

##Defining a Baseline Model
First, you are going to split the data into a training and testing set which will allow you to evaluate the accuracy and see if your model generalizes well. This means whether the model is able to perform well on data it has not seen before. This is a way to see if the model is overfitting.

Overfitting is when a model is trained too well on the training data. You want to avoid overfitting, as this would mean that the model mostly just memorized the training data. This would account for a large accuracy with the training data but a low accuracy in the testing data.

We start by taking the Yelp data set which we extract from our concatenated data set. From there, we take the sentences and labels.

In [10]:
df_yelp = df[df['source'] == 'yelp']
sentences = df_yelp['sentence'].values
y = df_yelp['label'].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)

Create the feature vectors for each sentence of the training and testing set:

In [11]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

In [12]:
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

CountVectorizer performs tokenization which separates the sentences into a set of tokens. It additionally removes punctuation and special characters and can apply other preprocessing to each word. If you want, you can use a custom tokenizer from the NLTK library with the CountVectorizer or use any number of the customizations which you can explore to improve the performance of your model.

The classification model we are going to use is the logistic regression which is a simple yet powerful linear model that is mathematically speaking in fact a form of regression between 0 and 1 based on the input feature vector. By specifying a cutoff value (by default 0.5), the regression model is used for classification.

In [13]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.796


You can see that the logistic regression reached an impressive 79.6%, but let’s have a look how this model performs on the other data sets that we have. In this script, we perform and evaluate the whole process for each data set that we have:

In [14]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


# Sentiment Analysis

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
print(os.listdir("/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences"))
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dense
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.manifold import TSNE
from gensim.models import word2vec
from nltk import word_tokenize
from nltk.corpus import stopwords

['readme.txt', 'amazon_cells_labelled.txt', 'yelp_labelled.csv', 'yelp_labelled.txt', 'amazon_cells_labelled.csv', 'imdb_labelled.txt', 'imdb_labelled.csv', 'sentiment labelled sentences']


In [19]:
filesList=os.listdir('/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/sentiment labelled sentences')
os.listdir('/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/sentiment labelled sentences')

['yelp_labelled.txt',
 'imdb_labelled.txt',
 'yelp_labelled.csv',
 'amazon_cells_labelled.txt',
 'amazon_cells_labelled.csv',
 'imdb_labelled.csv',
 'readme.txt']

In [20]:
imdb_labelFile='/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/sentiment labelled sentences/imdb_labelled.txt'
amazon_labelFile='/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/sentiment labelled sentences/amazon_cells_labelled.txt'
yelp_labelFile='/content/drive/MyDrive/Dataset Ai/sentiment labelled sentences/sentiment labelled sentences/yelp_labelled.txt'

In [21]:
def getReviewSentimentFromFile(file):
    fr=open(file)
    lines=fr.readlines()
    fr.close()
    reviewsentimentList=[]
    for l in lines:
        x=l.split('\t')
        reviewsentimentList.append([str.lstrip(str.rstrip(x[0])),str.lstrip(str.rstrip(x[1]))])
    return reviewsentimentList

In [22]:
rsList=getReviewSentimentFromFile(imdb_labelFile)+getReviewSentimentFromFile(amazon_labelFile)+getReviewSentimentFromFile(yelp_labelFile)
len(rsList[:])

3000

In [23]:
rsList[0]

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.',
 '0']

In [24]:
rsDF=pd.DataFrame(rsList,columns=['REVIEW','SENTIMENT'])

In [25]:
rsDF.head(5)

Unnamed: 0,REVIEW,SENTIMENT
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [26]:
X=rsDF['REVIEW']
y=rsDF['SENTIMENT']
y=to_categorical(num_classes=2,y=y)

In [27]:
np.shape(y)

(3000, 2)

In [28]:
tok=Tokenizer(lower=True,num_words=10000)

In [29]:
tok.fit_on_texts(X)
seqs=tok.texts_to_sequences(X)
padded_seqs=pad_sequences(seqs,maxlen=100)

In [30]:
def createLSTM():
    model=Sequential()
    model.add(Embedding(10000,100))
    model.add(LSTM(256))
    model.add(Dense(100,activation='sigmoid'))
    model.add(Dense(2,activation='sigmoid'))
    return model

In [31]:
model=createLSTM()
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, 256)               365568    
                                                                 
 dense (Dense)               (None, 100)               25700     
                                                                 
 dense_1 (Dense)             (None, 2)                 202       
                                                                 
Total params: 1,391,470
Trainable params: 1,391,470
Non-trainable params: 0
_________________________________________________________________


In [34]:
X_train,X_test,y_train,y_test=train_test_split(
    padded_seqs,y,train_size=0.85,test_size=0.15,random_state=43
    )

In [35]:
np.shape(X_train),np.shape(y_train)

((2550, 100), (2550, 2))

In [36]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])
model.fit(X_train,y_train,batch_size=32,epochs=5,verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f013aedfe10>

In [37]:
acc=model.evaluate(X_test,y_test)[1]*100
print("The model accuracy is {}".format(acc))

The model accuracy is 81.33333325386047
