## Libraries

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from wordcloud import WordCloud

## Data Gathering

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the training CSV into `df` and preview its first rows.
df = pd.read_csv('/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Train.csv')
df.head()

In [None]:
# Summarize the frame: column names, dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Distinct values present in the label column.
df['label'].unique()

In [None]:
# Number of rows per label value.
df['label'].value_counts()

In [None]:
# Pie chart of how the rows are split between the label values.
label_counts = df['label'].value_counts()
label_counts.plot.pie(figsize=(20, 8))
plt.show()

- '0' refers to negative feedback, '1' refers to positive feedback

## Data Preprocessing

In [None]:
def text_prepare(text):
    """Turn one raw review string into a list of cleaned, lemmatized tokens.

    Steps, in order: lower-case the text, tokenize it with
    ``nltk.word_tokenize``, keep only tokens that are longer than two
    characters, purely alphabetic and not English stopwords, then lemmatize
    each surviving token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    list of str
        The lemmatized tokens, in their original order.
    """
    # The lemmatizer and the stopword set are identical for every call, so
    # build them once and cache them on the function object. The original
    # rebuilt both for every single review when applied over a DataFrame.
    resources = getattr(text_prepare, "_resources", None)
    if resources is None:
        resources = (WordNetLemmatizer(), frozenset(stopwords.words('english')))
        text_prepare._resources = resources
    lemmatizer, stop_words = resources

    # Single pass: the three filters are independent per-token predicates,
    # so fusing them keeps exactly the same tokens in the same order.
    return [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(text.lower())
        if len(token) > 2 and token.isalpha() and token not in stop_words
    ]

In [None]:
# Replace each raw review string with its list of cleaned tokens.
df['text'] = df['text'].apply(text_prepare)

In [None]:
# Word -> occurrence count. Starts empty; the counting loop in the next cell fills it.
freq_count = {}

In [None]:
# Tally every token of every review into freq_count.
for tokens in df['text']:
    for token in tokens:
        freq_count[token] = freq_count.get(token, 0) + 1


In [None]:
# Same word -> count mapping, reordered from most to least frequent.
by_count_desc = sorted(freq_count.items(), key=lambda pair: pair[1], reverse=True)
freq_count_sorted = dict(by_count_desc)

In [None]:
# Vocabulary size cap: only the SET_LIMIT most frequent words get a feature index.
SET_LIMIT = 5000

In [None]:
# Assign each of the SET_LIMIT most frequent words its rank as a column index.
top_words = list(freq_count_sorted)[:SET_LIMIT]
word_index_map = {word: position for position, word in enumerate(top_words)}

In [None]:
def text_vector(text, label):
    """Encode one tokenized review as normalized word frequencies plus its label.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.
    label : number
        Class label; stored in the last slot of the returned vector.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(word_index_map) + 1``. Slot ``i``
        (for ``i < len(word_index_map)``) holds the share of in-vocabulary
        tokens that map to index ``i``; the final slot holds ``label``.
        If no token is in the vocabulary, all feature slots are 0.
    """
    x = np.zeros(len(word_index_map) + 1)
    for word in text:
        index = word_index_map.get(word)
        if index is not None:
            x[index] += 1

    # The label slot is still 0 here, so the sum covers only word counts.
    # Guard the division: a review with no in-vocabulary token would
    # otherwise produce 0/0 = NaN in every feature, which breaks model fitting.
    total = x.sum()
    if total > 0:
        x /= total
    x[-1] = label
    return x

In [None]:
# One row per training review; one column per vocabulary word plus one for the label.
n_rows, n_cols = len(df), len(word_index_map) + 1
data = np.zeros((n_rows, n_cols))

In [None]:
# Fill the pre-allocated matrix one review at a time.
# Columns are read by position, as before: 0 holds the token list, 1 the label.
# enumerate/zip replaces the dead `idx = 0` initialisation, the hand-maintained
# `index` counter (which always equalled `idx`) and the per-cell iloc lookups.
for row, (tokens, label) in enumerate(zip(df.iloc[:, 0], df.iloc[:, 1])):
    data[row, :] = text_vector(tokens, label)

## Data Modelling

In [None]:
# Split the matrix: every column but the last is a feature, the last is the label.
X, y = data[:, :-1], data[:, -1]

In [None]:
# Fit a logistic-regression classifier with default settings on the training features.
model = LogisticRegression()
model.fit(X,y)

In [None]:
# Score on the same data the model was fitted on (training score, not a held-out estimate).
model.score(X,y)

## Exploring Test data

In [None]:
# Load the test CSV into `test_data` and preview its first five rows.
test_data = pd.read_csv('/kaggle/input/imdb-dataset-sentiment-analysis-in-csv-format/Test.csv')
test_data.head(5)

In [None]:
# Apply the same cleaning/tokenization used for the training reviews.
test_data['text'] = test_data['text'].apply(text_prepare)

In [None]:
# One row per test review; one column per vocabulary word plus one for the label.
n_test_rows, n_test_cols = len(test_data), len(word_index_map) + 1
data2 = np.zeros((n_test_rows, n_test_cols))

In [None]:
# Fill the pre-allocated test matrix one review at a time.
# Columns are read by position, as before: 0 holds the token list, 1 the label.
# enumerate/zip replaces the dead `idx = 0` initialisation, the hand-maintained
# `index` counter (which always equalled `idx`) and the per-cell iloc lookups.
for row, (tokens, label) in enumerate(zip(test_data.iloc[:, 0], test_data.iloc[:, 1])):
    data2[row, :] = text_vector(tokens, label)

In [None]:
# Split the test matrix: every column but the last is a feature, the last is the label.
X_test, y_test = data2[:, :-1], data2[:, -1]

In [None]:
# Predicted label for every test review (displayed only, not stored).
model.predict(X_test)

In [None]:
# Score of the fitted model on the test features and labels.
model.score(X_test,y_test)

## Conclusion

- The model's accuracy on the test set (computed above) is good enough
- The words with the strongest positive and negative impact on the prediction are shown below

### Words Having *Positive* Impact

In [None]:
# Keep the vocabulary words whose learned coefficient is above the cut-off.
threshold = 0.8
class_weights = model.coef_[0]
positive_score = {
    word: class_weights[column]
    for word, column in word_index_map.items()
    if class_weights[column] > threshold
}

In [None]:
# Reorder from the largest coefficient to the smallest.
positive_ranked = sorted(positive_score.items(), key=lambda pair: pair[1], reverse=True)
positive_score = dict(positive_ranked)

In [None]:
# Size each word by its learned coefficient. Joining the words into one string
# and calling generate() re-tokenizes them and counts every word once, so the
# coefficients stored in positive_score would not influence the picture.
positive_frequencies = {word: float(weight) for word, weight in positive_score.items()}
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(positive_frequencies)
plt.figure(figsize=(20, 20), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Words Having *Negative* Impact

In [None]:
# Keep the vocabulary words whose learned coefficient is below the negative cut-off.
threshold = 1
class_weights = model.coef_[0]
negative_score = {
    word: class_weights[column]
    for word, column in word_index_map.items()
    if class_weights[column] < -threshold
}

In [None]:
# Reorder ascending, so the most negative coefficient comes first.
negative_ranked = sorted(negative_score.items(), key=lambda pair: pair[1])
negative_score = dict(negative_ranked)

In [None]:
# Size each word by the magnitude of its (negative) coefficient. Joining the
# words into one string and calling generate() re-tokenizes them and counts
# every word once, so the coefficients would not influence the picture.
# Frequencies must be positive, hence abs().
negative_frequencies = {word: abs(float(weight)) for word, weight in negative_score.items()}
wordcloud = WordCloud(width=1000, height=500).generate_from_frequencies(negative_frequencies)
plt.figure(figsize=(20, 20), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

- Thank you. Please share your feedback to help improve this notebook.