# Importing Libraries

In [1]:
import pandas as pd

In [2]:
# Load the already pre-processed training reviews. The path is relative,
# so the CSV must sit in the notebook's working directory.
df = pd.read_csv("training_processed_dataset.csv")
# Preview the first rows to confirm the expected columns were read.
df.head()

Unnamed: 0,review,sentiment,clean_lowercase,clean_lowercase_re,no_stopwords,tokens,lemmatized,revised_tokens
0,In Panic In The Streets Richard Widmark plays ...,1,in panic in the streets richard widmark plays ...,in panic in the streets richard widmark plays ...,panic streets richard widmark plays us navy do...,"['panic', 'streets', 'richard', 'widmark', 'pl...",panic street richard widmark play u navy docto...,"['panic', 'street', 'richard', 'widmark', 'pla..."
1,If you ask me the first one was really better ...,0,if you ask me the first one was really better ...,if you ask me the first one was really better ...,ask first one really better one look sarah g r...,"['ask', 'first', 'one', 'really', 'better', 'o...",ask first one really better one look sarah g r...,"['ask', 'first', 'one', 'really', 'better', 'o..."
2,I am a big fan a Faerie Tale Theatre and I've ...,1,i am a big fan a faerie tale theatre and i've ...,i am a big fan a faerie tale theatre and ive s...,big fan faerie tale theatre ive seen one best ...,"['big', 'fan', 'faerie', 'tale', 'theatre', 'i...",big fan faerie tale theatre ive seen one best ...,"['big', 'fan', 'faerie', 'tale', 'theatre', 'i..."
3,I just finished reading a book about Dillinger...,0,i just finished reading a book about dillinger...,i just finished reading a book about dillinger...,finished reading book dillinger movie horribly...,"['finished', 'reading', 'book', 'dillinger', '...",finished reading book dillinger movie horribly...,"['finished', 'reading', 'book', 'dillinger', '..."
4,Greg Davis and Bryan Daly take some crazed sta...,0,greg davis and bryan daly take some crazed sta...,greg davis and bryan daly take some crazed sta...,greg davis bryan daly take crazed statements t...,"['greg', 'davis', 'bryan', 'daly', 'take', 'cr...",greg davis bryan daly take crazed statement te...,"['greg', 'davis', 'bryan', 'daly', 'take', 'cr..."


In [3]:
# (rows, columns) of the loaded frame -- a quick sanity check on the load.
df.shape

(25000, 8)

# Model Training: Latent Dirichlet Allocation (LDA)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [5]:
# Bag-of-words term counts built from the "lemmatized" text column.
# CountVectorizer is used with its default settings; the resulting
# document-term matrix is the input to LDA below.
raw_vectorizer = CountVectorizer()
raw_doc_term_matrix = raw_vectorizer.fit_transform(df["lemmatized"])

In [6]:
# Fit a 5-topic LDA model on the raw count matrix.
# random_state is pinned so that re-running the notebook yields the same topics.
lda = LatentDirichletAllocation(n_components=5,random_state=42)
lda.fit(raw_doc_term_matrix)

In [7]:
# Print the 10 highest-weighted terms of each LDA topic.
# The vocabulary is fetched once up front: get_feature_names_out() rebuilds
# the full feature array on every call, and the original comprehension
# invoked it once per printed word.
lda_terms = raw_vectorizer.get_feature_names_out()
for idx, topic in enumerate(lda.components_):
    print(f"Topic {idx}:")
    # argsort is ascending, so the reversed slice walks the 10 largest weights first.
    print([lda_terms[i] for i in topic.argsort()[:-10 - 1:-1]])

Topic 0:
['movie', 'film', 'one', 'story', 'character', 'great', 'series', 'love', 'time', 'book']
Topic 1:
['movie', 'film', 'one', 'like', 'good', 'time', 'really', 'even', 'get', 'would']
Topic 2:
['film', 'one', 'movie', 'story', 'like', 'character', 'scene', 'good', 'time', 'also']
Topic 3:
['film', 'one', 'year', 'war', 'play', 'match', 'world', 'young', 'role', 'also']
Topic 4:
['film', 'character', 'one', 'time', 'make', 'like', 'story', 'role', 'also', 'great']


In [8]:
def display_topics(model, feature_names, no_top_words):
    """Collect the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model
        Fitted estimator exposing ``components_``; each row is iterated as
        one topic and must support ``argsort()``.
    feature_names
        Indexable that maps a column position of ``components_`` to its term.
    no_top_words
        Number of terms to keep per topic.

    Returns
    -------
    dict
        Maps ``"Topic 1"``, ``"Topic 2"``, ... (1-based) to a list of terms
        ordered from the largest weight to the smallest.
    """
    return {
        # argsort is ascending: reverse it, then keep the leading entries.
        f"Topic {position}": [
            feature_names[term_id]
            for term_id in weights.argsort()[::-1][:no_top_words]
        ]
        for position, weights in enumerate(model.components_, start=1)
    }

# Tabulate the LDA topics: one column per topic, rows ordered from the
# highest-weighted term downwards.
no_top_words = 10
topics = display_topics(
    model=lda,
    feature_names=raw_vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)
topics_df = pd.DataFrame(data=topics)
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,movie,movie,film,film,film
1,film,film,one,one,character
2,one,one,movie,year,one
3,story,like,story,war,time
4,character,good,like,play,make
5,great,time,character,match,like
6,series,really,scene,world,story
7,love,even,good,young,role
8,time,get,time,role,also
9,book,would,also,also,great


# Model Training: Latent Semantic Analysis (LSA)

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [10]:
# TF-IDF weighted document-term matrix built from the same "lemmatized"
# column; TfidfVectorizer is used with its default settings.
# X is the input to the truncated SVD (LSA) fitted in the next cell.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["lemmatized"])

In [11]:
# LSA: truncated SVD with 5 components on the TF-IDF matrix, seeded for
# reproducibility. X_lsa keeps the output of fit_transform, i.e. the
# documents projected onto the 5 components.
lsa = TruncatedSVD(n_components=5, random_state=42)
X_lsa = lsa.fit_transform(X)

In [12]:
# Print the 10 highest-loading terms of each LSA component.
terms = vectorizer.get_feature_names_out()
for i, comp in enumerate(lsa.components_):
    # argsort is ascending: reverse it and keep the first 10 positions.
    # sep="\n" puts the heading and the term list on separate lines.
    print(f"Topic {i}: ", [terms[j] for j in comp.argsort()[::-1][:10]], sep="\n")

Topic 0: 
['movie', 'film', 'one', 'like', 'good', 'time', 'really', 'character', 'story', 'bad']
Topic 1: 
['movie', 'bad', 'watch', 'worst', 'dont', 'ever', 'waste', 'seen', 'funny', 'good']
Topic 2: 
['show', 'episode', 'series', 'season', 'tv', 'character', 'family', 'love', 'kid', 'life']
Topic 3: 
['bad', 'show', 'worst', 'dont', 'episode', 'even', 'like', 'ever', 'acting', 'really']
Topic 4: 
['film', 'show', 'great', 'movie', 'seen', 'watch', 'ever', 'episode', 'tv', 'saw']


In [13]:
# Tabulate the LSA components the same way as the LDA topics above.
# Note: this rebinds `topics` and `topics_df`, replacing the LDA table.
no_top_words = 10
topics = display_topics(
    model=lsa,
    feature_names=vectorizer.get_feature_names_out(),
    no_top_words=no_top_words,
)
topics_df = pd.DataFrame(data=topics)
topics_df

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5
0,movie,movie,show,bad,film
1,film,bad,episode,show,show
2,one,watch,series,worst,great
3,like,worst,season,dont,movie
4,good,dont,tv,episode,seen
5,time,ever,character,even,watch
6,really,waste,family,like,ever
7,character,seen,love,ever,episode
8,story,funny,kid,acting,tv
9,bad,good,life,really,saw


# Model Evaluation

In [14]:
# Perplexity of the fitted LDA model, evaluated on the same count matrix it
# was trained on -- this is an in-sample score, not a held-out estimate.
lda.perplexity(raw_doc_term_matrix)

np.float64(5526.723408009557)