In [1]:
# import the required packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Problem1

## 1. Text Data Preprocessing

### 1.1 Load the Text Data

In [2]:
dat = pd.read_csv("AA_movie_train_data.csv")
dat.head()

Unnamed: 0,Title,Genre,Description
0,Windup (2006),action,Windup is a very original comedy. We follow B...
1,Hitman (2014),action,"Rana and Shuvo, two siblings, are very differ..."
2,Taken by Force (2010),action,When a San Francisco detective goes hunting f...
3,Bui doi Cho Lon (2013),action,"A man name Lam picks up his girlfriend, but i..."
4,Siam Yuth: The Dawn of the Kingdom (2015),action,Thap and Sin are a homeless musician brothers...


In [3]:
dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        5000 non-null   object
 1   Genre        5000 non-null   object
 2   Description  5000 non-null   object
dtypes: object(3)
memory usage: 117.3+ KB


In [4]:
dat["Genre"].value_counts()

Genre
action         1000
comedy         1000
documentary    1000
drama          1000
thriller       1000
Name: count, dtype: int64

### 1.2 Cleanse the Text Data

In [5]:
dat["Description"][0]

' Windup is a very original comedy. We follow Bernie Shaddick from his tragic childhood, through to his present day pursuit of a career as an inventor. Likened to such films as "Raising Arizona" and "The Big Lebowski", gritty cohen-esque humor saturates Bernie Shaddick\'s life, from his encounter with mace at a potential sale, to his murderous mentors posing as his Aunt Audre and Uncle Reg. Bernie is a man who, in the face of rejection and ridicule, just "doesn\'t get it", and perseveres anyways. His good nature gets on the nerves of his cruel co-workers at his commercial real estate day-job. Upholding an almost delusional enthusiasm, Bernie unknowingly sells his soul with a smile! Bernie\'s girlfriend Latrice and roommate Stuart only accentuate Bernie\'s seemingly pathetic existence. Latrice, sexy in a dirty sort of way, loves Bernie, although she would prefer to love his money, if he had any. Stuart, usually clad in underpants and obsessed with the rubix-cube, lives with Bernie becau

In [6]:
# remove symbols, non-alphanumeric chars
def pre_process(text: str) -> str:
    text = text.lower()
    text = re.sub("(\\d|\\W|_)+", " ", text)

    return text

def get_stopwords(filepath: str) -> frozenset[str]:
    with open(filepath, 'r', encoding="utf-8") as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)


In [7]:
stopwords = list(get_stopwords("stopwords.txt"))

# text corpus
descriptions = dat["Description"].apply(lambda x: pre_process(x))

In [8]:
descriptions[0]

' windup is a very original comedy we follow bernie shaddick from his tragic childhood through to his present day pursuit of a career as an inventor likened to such films as raising arizona and the big lebowski gritty cohen esque humor saturates bernie shaddick s life from his encounter with mace at a potential sale to his murderous mentors posing as his aunt audre and uncle reg bernie is a man who in the face of rejection and ridicule just doesn t get it and perseveres anyways his good nature gets on the nerves of his cruel co workers at his commercial real estate day job upholding an almost delusional enthusiasm bernie unknowingly sells his soul with a smile bernie s girlfriend latrice and roommate stuart only accentuate bernie s seemingly pathetic existence latrice sexy in a dirty sort of way loves bernie although she would prefer to love his money if he had any stuart usually clad in underpants and obsessed with the rubix cube lives with bernie because it allows him to pursue his b

In [9]:
descriptions.shape

(5000,)

### 1.3 Bag-of-Word

In [10]:
count_vect = CountVectorizer(max_df=0.15,
                             stop_words=stopwords,
                             max_features=5000)

desc_counts = count_vect.fit_transform(descriptions.to_list())

In [11]:
desc_counts.shape

(5000, 5000)

In [12]:
count_vect.vocabulary_

{'original': 3155,
 'comedy': 873,
 'follow': 1789,
 'bernie': 443,
 'tragic': 4584,
 'childhood': 756,
 'day': 1134,
 'pursuit': 3520,
 'career': 643,
 'films': 1745,
 'raising': 3566,
 'arizona': 245,
 'gritty': 1973,
 'humor': 2195,
 'encounter': 1486,
 'mace': 2738,
 'potential': 3378,
 'sale': 3858,
 'murderous': 2980,
 'aunt': 330,
 'uncle': 4677,
 'doesn': 1332,
 'nature': 3022,
 'cruel': 1072,
 'co': 833,
 'workers': 4955,
 'commercial': 885,
 'real': 3606,
 'estate': 1553,
 'job': 2442,
 'enthusiasm': 1523,
 'unknowingly': 4712,
 'sells': 3947,
 'soul': 4155,
 'girlfriend': 1920,
 'roommate': 3811,
 'stuart': 4289,
 'seemingly': 3938,
 'existence': 1593,
 'sexy': 3999,
 'dirty': 1292,
 'sort': 4153,
 'loves': 2713,
 'money': 2934,
 'usually': 4740,
 'obsessed': 3094,
 'lives': 2670,
 'allows': 150,
 'pursue': 3516,
 'beer': 411,
 'check': 746,
 'flashbacks': 1766,
 'eccentric': 1417,
 'learn': 2599,
 'strange': 4264,
 'habit': 2004,
 'murdering': 2979,
 'delivery': 1182,
 'boy

In [13]:
sum_words = desc_counts.sum(axis = 0)

# items in sum_words can be accessed with sum_words[0, idx]
# where idx is the index of the accessed item
sum_words

matrix([[25, 16, 19, ..., 59, 74, 28]])

In [14]:
# store in tuples (word, word count)
frequencies = [(word, sum_words[0, idx]) for word, idx in count_vect.vocabulary_.items()]

# sort based on word, reverse for descending order
frequencies = sorted(frequencies, key = lambda x: x[1], reverse = True)

print("Highest freq words: ")
frequencies[0:20]

Highest freq words: 


[('people', 952),
 ('father', 861),
 ('own', 842),
 ('lives', 838),
 ('day', 819),
 ('friends', 765),
 ('mother', 726),
 ('wife', 720),
 ('home', 718),
 ('son', 694),
 ('police', 658),
 ('woman', 650),
 ('help', 648),
 ('friend', 640),
 ('takes', 636),
 ('death', 620),
 ('girl', 618),
 ('city', 616),
 ('town', 598),
 ('school', 586)]

In [15]:
# get all unique words identified by CountVectorizer tokenization
feature_names = np.array(count_vect.get_feature_names_out())
len(feature_names)

5000

In [16]:
pd.DataFrame(desc_counts.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 1.4 TF-IDF

In [17]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf = True)

desc_tfidf = tfidf_transformer.fit_transform(desc_counts)

In [18]:
tfidf_transformer.idf_

array([7.1194979 , 7.7256337 , 6.57295419, ..., 6.05165727, 5.82851372,
       6.5216609 ])

In [19]:
tfidf_transformer.idf_.shape

(5000,)

In [20]:
idf_sorted = np.argsort(tfidf_transformer.idf_)
print("Features with lowest idf:\n{}".format(
       feature_names[idf_sorted[:100]]))

Features with lowest idf:
['own' 'people' 'lives' 'day' 'takes' 'help' 'friends' 'father' 'home'
 'wife' 'woman' 'friend' 'death' 'soon' 'son' 'mother' 'girl' 'city'
 'police' 'goes' 'comes' 'daughter' 'night' 'town' 'live' 'decides'
 'documentary' 'past' 'local' 'house' 'set' 'school' 'brother' 'begins'
 'living' 'series' 'meets' 'journey' 'money' 'women' 'killed' 'real'
 'named' 'job' 'fight' 'left' 'war' 'tries' 'movie' 'dead' 'husband'
 'trying' 'american' 'secret' 'true' 'little' 'country' 'former' 'meet'
 'follows' 'relationship' 'beautiful' 'evil' 'sister' 'children' 'finally'
 'murder' 'save' 'look' 'human' 'called' 'history' 'tells' 'business'
 'mysterious' 'start' 'events' 're' 'themselves' 'team' 'lost' 'makes'
 'starts' 'power' 'days' 'personal' 'discovers' 'bring' 'falls' 'stories'
 'run' 'escape' 'parents' 'stop' 'boy' 'truth' 'kill' 'found' 'forced'
 'head']


In [21]:
pd.DataFrame(desc_tfidf.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 2. Text Data Understanding

### 2.1 Extract keywords using TF-IDF matrix

In [22]:
descriptions[0]

' windup is a very original comedy we follow bernie shaddick from his tragic childhood through to his present day pursuit of a career as an inventor likened to such films as raising arizona and the big lebowski gritty cohen esque humor saturates bernie shaddick s life from his encounter with mace at a potential sale to his murderous mentors posing as his aunt audre and uncle reg bernie is a man who in the face of rejection and ridicule just doesn t get it and perseveres anyways his good nature gets on the nerves of his cruel co workers at his commercial real estate day job upholding an almost delusional enthusiasm bernie unknowingly sells his soul with a smile bernie s girlfriend latrice and roommate stuart only accentuate bernie s seemingly pathetic existence latrice sexy in a dirty sort of way loves bernie although she would prefer to love his money if he had any stuart usually clad in underpants and obsessed with the rubix cube lives with bernie because it allows him to pursue his b

In [None]:
temp = pd.DataFrame(zip(
    (desc_tfidf.tocoo().col,
     desc_tfidf.tocoo().data)
))

In [24]:
dat['cleaned_review'] = descriptions
dat['keywords'] = results

dat.head()

  right=ast.Str(s=sentinel),
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)
  right=ast.Str(s=sentinel),
  return Constant(*args, **kwargs)


NameError: name 'results' is not defined

In [None]:
dat.to_csv("descriptions_export.csv")

### 2.2 Association Rules Mining on keywords

# Problem2

## 3. Classification Modeling

### 3.1 Sample the data

### 3.2 Build the Model(s)

### 3.3 Evaluate and Improve the Model(s)