# Machine Learning  Text Classification With Python and Keras
# shahed Alkhateeb
# Date : 2021-10-31
##  sentiment analysis

In [48]:

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import pandas as pd


In [49]:
filepath_dict = {'yelp':   '/home/shahd/text-classifier/dataset/yelp_labelled.txt',
                 'amazon': '/home/shahd/text-classifier/dataset/amazon_cells_labelled.txt',
                 'imdb':   '/home/shahd/text-classifier/dataset/imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


In [50]:
sentences = ['John likes ice cream', 'John hates chocolate.']

In [51]:
vectorizer = CountVectorizer(min_df=0, lowercase=False)

In [52]:
vectorizer.fit(sentences)

CountVectorizer(lowercase=False, min_df=0)

In [53]:
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

In [54]:
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [55]:
df_yelp = df[df['source'] == 'yelp']

In [56]:
sentences = df_yelp['sentence'].values

In [57]:
y = df_yelp['label'].values
print(y)

[1 0 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 1 1 1 1 1 0 1 0 0 1 0 1 0 1 1 1
 0 1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0
 0 0 0 1 1 0 0 0 0 1 0 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0
 0 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 0 1 1 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0
 0 0 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 0 0 0 1
 1 0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 1 0 0 0 1 0 0 1 0 1 1 0 1 0 1 0 0
 0 0 0 1 1 1 0 1 1 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1
 0 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1
 0 1 0 1 1 0 0 0 1 0 0 0 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1
 0 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 0 0 1 0 1 1 0 1 1 1 0 1 1 0 1 0 0 1 1 1
 0 0 1 1 0 1 0 1 0 0 0 1 1 0 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0 1
 1 1 0 1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 1 0 0 1 0 1 1 0 1 0 1 1 1 1 0 1 1 0
 1 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0
 1 0 0 1 1 1 0 0 1 1 1 0 

In [58]:
sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=1000)
print("taining Data :",sentences_train)
print("------------------------------------")
print("Testing Data :",sentences_test)
print("------------------------------------")
print(y_train)


taining Data : ['The food was barely lukewarm, so it must have been sitting waiting for the server to bring it out to us.'
 'Sorry, I will not be getting food from here anytime soon :('
 'Of all the dishes, the salmon was the best, but all were great.'
 'The fries were not hot, and neither was my burger.'
 "In fact I'm going to round up to 4 stars, just because she was so awesome."
 'Will go back next trip out.'
 'This was my first crawfish experience, and it was delicious!'
 "I could barely stomach the meal, but didn't complain because it was a business lunch."
 'A great way to finish a great.'
 'Best service and food ever, Maria our server was so good and friendly she made our day.'
 'Good food , good service .'
 'My drink was never empty and he made some really great menu suggestions.'
 'Be sure to order dessert, even if you need to pack it to-go - the tiramisu and cannoli are both to die for.'
 'The food was excellent and service was very good.'
 'All of the tapas dishes were delic

In [59]:
vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)
print("vectorizer :",vectorizer)
X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)
X_train

vectorizer : CountVectorizer()


<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

In [60]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)
score = classifier.score(X_test, y_test)

print("Accuracy:", score)

Accuracy: 0.796


In [61]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


In [62]:
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [81]:
new_data = ['love','good','new','nice','bad','wow','Stopped by during the late May bank holiday','stop','yes','no','great']
data = vectorizer.transform(new_data)
result=classifier.predict(data)
result

array([1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1])