In [None]:
# import the required packages
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns

import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# tokenization
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# classification modelling
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# list of directories NLTK searches for its data files
nltk.data.path

In [None]:
# download the NLTK resources this notebook relies on (nothing is trained here)
# tokenizer models used by word_tokenize
nltk.download("punkt")
nltk.download("punkt_tab")

# WordNet data for WordNetLemmatizer and the tagger model for pos_tag
nltk.download("wordnet")
nltk.download("omw-1.4")
nltk.download("averaged_perceptron_tagger_eng")

### 1.1 Text data pre-processing

In [None]:
# load text data; the columns used below are "Description" (free text) and "Genre" (label)
dat = pd.read_csv("AA_movie_train_data.csv")
dat.head()

In [None]:
# column dtypes and non-null counts
dat.info()

In [None]:
# number of rows per genre
dat["Genre"].value_counts()

### 1.2 Cleanse text data

In [None]:
def pre_process(text: str) -> str:
    """Normalise raw text ahead of tokenisation.

    The text is lower-cased and every character that is not an ASCII letter
    (digits, punctuation, whitespace, accented letters, ...) is replaced by
    one space. Consecutive replacements are not collapsed.
    """
    lowered = text.lower()
    return re.sub('[^A-Za-z]', " ", lowered)

In [None]:
def get_stopwords(filepath: str) -> frozenset[str]:
    """Read a stop-word list from a text file holding one word per line.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        The distinct stop words with surrounding whitespace stripped. Blank
        lines are skipped so the empty string never ends up in the set.

    Raises:
        OSError: If the file cannot be opened.
    """
    # explicit encoding: the platform default differs between operating systems
    with open(filepath, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return frozenset(word for word in stripped if word)

In [None]:
# clean every non-missing description, then split each cleaned text into word tokens
descriptions = dat["Description"].dropna().astype(str)
descriptions_all = descriptions.map(pre_process)
tokenized_desc = descriptions_all.map(word_tokenize).tolist()

In [None]:
# tokens of the first non-missing description
print(tokenized_desc[0])

In [None]:
# remove stopwords

stopwords = list(get_stopwords("stopwords.txt"))
# membership tests against a list scan every entry; a frozenset makes the
# per-token check constant time (`stopwords` itself stays a list as before)
stopword_set = frozenset(stopwords)

filtered_desc = [
    [word for word in sentence if word not in stopword_set]
    for sentence in tokenized_desc
]

# show the same document before and after filtering; it is taken by position from
# `descriptions` because row label 0 of `dat` may have been dropped as missing
print(descriptions.iloc[0])
print(filtered_desc[0])

In [None]:
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for ``word`` tagged on its own.

    The word is tagged in isolation with ``pos_tag`` and the first letter of
    the resulting tag is mapped to the adjective, noun, verb or adverb
    constant; any other tag falls back to noun.
    """
    tag = pos_tag([word])[0][1][0].upper()
    return {
        'J': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }.get(tag, wordnet.NOUN)

# Apply lemmatization with POS tagging.
# Words are tagged without context, so a word's lemma depends only on the word
# itself: resolve every distinct word once and reuse the result instead of
# running the tagger for each occurrence.
lemma_by_word = {
    word: lemmatizer.lemmatize(word, get_wordnet_pos(word))
    for word in {word for sentence in filtered_desc for word in sentence}
}

lemmatized_desc = [
    [lemma_by_word[word] for word in sentence]
    for sentence in filtered_desc
]

print(lemmatized_desc[0])

### 1.3 Bag of words

In [None]:
# rebuild one whitespace-separated string per document for the vectorizer
# NOTE(review): this joins filtered_desc, so the lemmatized tokens computed earlier
# are not what the bag-of-words model sees — confirm that is intended
cleaned_texts = list(map(" ".join, filtered_desc))

In [None]:
# vectorizer: ignore terms found in more than 80% of the documents and cap the
# vocabulary at 5000 terms
vectorizer = CountVectorizer(max_df=0.8, max_features=5000)
bow_matrix = vectorizer.fit_transform(cleaned_texts)

In [None]:
# total count of every vocabulary term over all documents
sum_words = bow_matrix.sum(axis=0)

# (term, total count) pairs, most frequent first
bow_freq = sorted(
    ((term, sum_words[0, column]) for term, column in vectorizer.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Highest freq words: ")
bow_freq[:20]

In [None]:
# vocabulary terms in matrix column order
feature_names = np.array(vectorizer.get_feature_names_out())

# vocabulary size
len(feature_names)

In [None]:
# (number of documents, vocabulary size)
bow_matrix.shape

In [None]:
# dense copy of the count matrix with one column per vocabulary term
# (toarray() materialises every zero, so this grows with documents x terms)
bow_df = pd.DataFrame(
    bow_matrix.toarray(),
    columns = vectorizer.get_feature_names_out()
)

bow_df.head()

### 1.4 TF-IDF

In [None]:
# re-weight the raw counts with smoothed inverse document frequencies
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)

# fitted on the labelled frame; get_feature_names_out() is read from this transformer later on
tfidf_matrix = tfidf_transformer.fit_transform(bow_df)

In [None]:
# one idf weight per vocabulary term
tfidf_transformer.idf_.shape

In [None]:
# term positions ordered by ascending idf (lowest idf = most widespread terms)
sorted_by_tfidf = tfidf_transformer.idf_.argsort()
lowest_idf_terms = feature_names[sorted_by_tfidf[:100]]
print("Features with lowest idf:\n{}".format(lowest_idf_terms))

In [None]:
# dense view of the tf-idf weights, labelled with the transformer's feature names
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tfidf_transformer.get_feature_names_out()
)

tfidf_df.head()

In [None]:
# largest tf-idf weight each term reaches in any document
max_value = tfidf_matrix.max(axis=0).toarray().ravel()
sorted_by_tfidf = np.argsort(max_value)

lowest_terms = feature_names[sorted_by_tfidf[:20]]
highest_terms = feature_names[sorted_by_tfidf[-20:]]

print("Features with lowest tfidf:\n{}".format(lowest_terms))
print("Features with highest tfidf: \n{}".format(highest_terms))

### 2.1 Extracting keywords from TF-IDF matrix

In [None]:
idx = 0
# descriptions_all keeps the row labels of `dat` (rows with a missing description
# were dropped) while tfidf_matrix rows are numbered 0..n-1, so the text is
# selected by position to match the matrix row
doc = descriptions_all.iloc[idx]
print(doc)

tf_idf_vector = tfidf_matrix[idx]
print(tf_idf_vector.shape)

In [None]:
# non-zero entries of the selected row as (feature number, weight) pairs
row_coo = tf_idf_vector.tocoo()
temp = pd.DataFrame(
    zip(row_coo.col, row_coo.data),
    columns=['feature_number', 'tf_idf'],
)
temp

In [None]:
# heaviest terms first
temp = temp.sort_values('tf_idf', ascending=False)
temp

In [None]:
#use only topn items from vector

topn = 10
topn_items = temp[:topn]

# map feature numbers back to terms; plain str/float values keep the printed
# dictionary free of NumPy scalar wrappers
word = [str(feature_names[int(number)]) for number in topn_items['feature_number']]
tf_idf = [round(float(score), 3) for score in topn_items['tf_idf']]

print(doc, '\n')

result = dict(zip(word, tf_idf))
print(result)

In [None]:
# extract keywords for all reviews

topn = 10
first_results = []

# tfidf_matrix rows are positional while descriptions_all carries the row labels
# of `dat`; walking the matrix by position keeps every keyword set attached to the
# right description even when rows with missing text were dropped
for row_pos in range(tfidf_matrix.shape[0]):
    row = tfidf_matrix[row_pos].tocoo()

    # positions of the topn largest weights, highest first
    top = np.argsort(-row.data, kind="stable")[:topn]

    # plain str/float values so the dictionaries stay readable once exported
    first_results.append({
        str(feature_names[row.col[i]]): round(float(row.data[i]), 3)
        for i in top
    })


In [None]:
# one keyword dictionary per processed description
len(first_results)

In [None]:
dat["cleaned_desc"] = descriptions_all
# first_results has one entry per non-missing description; attaching the index of
# descriptions_all aligns each entry with its source row and leaves rows without a
# description as NaN instead of failing on a length mismatch
dat["keywords"] = pd.Series(first_results, index=descriptions_all.index)
dat.head()

In [None]:
# save the frame, now including the cleaned text and keyword columns
dat.to_csv("descriptions_export.csv")

In [None]:
# export lemmatized_desc to csv for ARM (association rule mining)

# one row per description, one token per column; shorter rows are padded with missing values
lemmatized_df = pd.DataFrame(lemmatized_desc)
lemmatized_df.to_csv("lemmatized_desc.csv", index=False, header=False)

### 2.2 Association Rule Mining

### Generating association rules

In [None]:
# only empty cells are padding; keep_default_na=False stops real words such as
# "null" or "nan" from being parsed as missing values
desc_data = pd.read_csv(
    "lemmatized_desc.csv",
    header=None,
    keep_default_na=False,
    na_values=[""],
)
desc_data.head()

In [None]:
# (number of descriptions, length of the longest token list)
desc_data.shape

In [None]:
# stack every token column into one long Series of words (padding NaNs removed);
# a single concat replaces the private Series._append call and the repeated
# copying it caused inside the loop
full_list = pd.concat([desc_data[col].dropna() for col in desc_data])

print(full_list)

In [None]:
from wordcloud import WordCloud

plt.rcParams['figure.figsize'] = (10, 10)
# str(full_list) is only the abbreviated repr of the Series (a few rows plus the
# "Length: ..., dtype: ..." footer), so the cloud is built from the real word counts
word_counts = full_list.astype(str).value_counts().to_dict()
wordcloud = WordCloud(background_color = 'white', width = 1200,  height = 1200, max_words = 121).generate_from_frequencies(word_counts)
plt.imshow(wordcloud)
plt.axis('off')
plt.title('Most Popular Items',fontsize = 20)
plt.show()

In [None]:
# occurrences of each word, most frequent first
full_list.value_counts()

In [None]:
# bar chart of the 50 most frequent words
fig, ax = plt.subplots(figsize=(18, 7))
full_list.value_counts().head(50).plot.bar(ax=ax)
ax.set_title('frequency of most popular items', fontsize = 20)
plt.setp(ax.get_xticklabels(), rotation=90)
ax.grid()
plt.show()

In [None]:
# the 50 most frequent words; the index of this frame selects the item columns further down
# NOTE(review): `y` is rebound to the genre labels in the modelling section, so the
# column selection only works while this value is still in place
y = full_list.value_counts().head(50).to_frame()
y.index

In [None]:
# build one transaction (list of words) per description
MAX_TRANSACTIONS = 5000   # only the first 5000 descriptions are mined
MAX_ITEMS = 20            # only the first 20 tokens of each description are used

# materialise the values once instead of once per row; slicing clamps both limits
# to the actual shape, so smaller data sets no longer raise IndexError
token_table = desc_data.iloc[:MAX_TRANSACTIONS, :MAX_ITEMS].to_numpy()

# missing cells are padding for short descriptions; skipping them keeps the
# literal string 'nan' from becoming an item
trans = [
    [str(token) for token in row if pd.notna(token)]
    for row in token_table
]

# converting it into a numpy array (object dtype because transactions differ in length)
trans = np.array(trans, dtype=object)

# checking the shape of the array
print(trans.shape)

In [None]:
# inspect the transactions
print(trans)

In [None]:
# One-hot encode the transactions: one boolean column per distinct word, one row
# per transaction. TransactionEncoder and pandas come from the import cell at the
# top of the notebook.
te = TransactionEncoder()
data_encoded = pd.DataFrame(te.fit(trans).transform(trans), columns = te.columns_)

# getting the shape of the data
data_encoded.shape

In [None]:
# one-hot table covering every distinct word
data_encoded

In [None]:
# keep only the columns named in y.index; .loc raises KeyError if one of those
# words is not a column of data_encoded
# NOTE(review): `y` must still hold the top-word frame here, not the genre labels
# it is rebound to in the modelling section
data_encoded = data_encoded.loc[:, y.index]

# checking the shape
data_encoded.shape

In [None]:
# one-hot table restricted to the selected words
data_encoded

In [None]:
# First, let us return the items and itemsets with at least 1% support
# (use_colnames=True reports itemsets by word instead of by column position):
frequent_itemsets = apriori(data_encoded, min_support = 0.01, use_colnames = True)
frequent_itemsets

In [None]:
# summary statistics of the support values
frequent_itemsets[['support']].describe()

In [None]:
# derive rules from the frequent itemsets, keeping those whose lift is at least 1
rules_l = association_rules(frequent_itemsets, metric="lift", min_threshold=1)
rules_l

In [None]:
# strongest associations first
rules_l = rules_l.sort_values('lift', ascending=False)
rules_l.head(10)

In [None]:
# spread of the lift values across all rules
rules_l[['lift']].boxplot()

### BOW MODELLING.



In [None]:
# check class distribution (counts include rows whose description is missing)
print("Genre distribution:")
print(dat["Genre"].value_counts())
print(f"\nTotal samples: {len(dat)}")

In [None]:
# remove rows with missing genre or description

# .copy() so later changes to model_data do not touch `dat`
model_data = dat.dropna(subset=['Description', 'Genre']).copy()
print(f"Data after removing NaN: {len(model_data)} samples")

# row labels of `dat` that survive the filter
valid_indices = model_data.index

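#### BOW models first
- LR (logistic regression)
- RFC (random forest classifier)
- SVC (support vector classifier)
- GNB (Gaussian naive Bayes)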

In [None]:
# bow_matrix has one row per non-missing description (see `descriptions`), while
# model_data additionally drops rows without a genre; keep only the matrix rows
# whose source row survived so that features and labels stay aligned
row_mask = descriptions.index.isin(valid_indices)
X = bow_matrix[row_mask]
y = model_data["Genre"].values

assert X.shape[0] == len(y), "feature rows and genre labels are misaligned"

# dense copy of the count matrix
X_dense = X.toarray()

In [None]:
# DO NOT FORGET TO SCALE VALUES

# standardise every term column to zero mean and unit variance
# NOTE(review): the scaler is fitted on all rows here, i.e. before the train/test
# split, so statistics of the held-out rows influence X_scaled
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_dense)

In [None]:
# encode genre labels as integers; label_encoder.classes_ keeps the original names
# for the reports further down

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [None]:
# split first, then scale: the scaler is fitted on the training rows only so that
# no statistics of the held-out rows leak into the features used for training
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_dense, y_encoded,
    train_size=(.60),
    random_state=42,
)

X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#### 1st round of training BOW models

In [None]:
# define base model hyperparams

first_models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(random_state=42),
    'Naive Bayes': GaussianNB(),
}

# NOTE(review): this rebinds first_results, which held the per-document keyword
# dictionaries earlier in the notebook
first_results = {}

for name, model in first_models.items():
    print(f"Training {model}")

    # fit on the training split and score the held-out split
    y_pred = model.fit(X_train, y_train).predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # 5-fold cross-validation restricted to the training split
    cv_score = cross_val_score(model, X_train, y_train, cv=5)
    cv_mean, cv_std = cv_score.mean(), cv_score.std()

    first_results[name] = {
        "model": model,
        "accuracy": accuracy,
        "cv_mean": cv_mean,
        "cv_std": cv_std,
        "predictions": y_pred
    }

    print(f"Accuracy: {accuracy:.4f}\n")


In [None]:
# one summary row per trained model, best test accuracy first
summary_rows = [
    {
        'Model': model_name,
        'Test Accuracy': outcome['accuracy'],
        'CV Mean': outcome['cv_mean'],
        'CV Std': outcome['cv_std'],
    }
    for model_name, outcome in first_results.items()
]

results_df = pd.DataFrame(
    summary_rows,
    columns=['Model', 'Test Accuracy', 'CV Mean', 'CV Std'],
).sort_values('Test Accuracy', ascending=False)

print("\nModel Performance Summary:\n")
print(results_df)

In [None]:
# results_df is sorted by test accuracy, so its first row names the best model
best_model_name = results_df['Model'].iloc[0]
best_entry = first_results[best_model_name]
best_model = best_entry['model']
best_predictions = best_entry['predictions']

print(f"Best Model: {best_model_name}")
print(f"Test Accuracy: {best_entry['accuracy']:.4f}")

In [None]:
print("\nClassification Report:")

# genre names in the order of their encoded integer labels
class_names = label_encoder.classes_

# dictionary form kept in `report`; the text form is what gets printed
report = classification_report(y_test, best_predictions, target_names=class_names, output_dict=True)
print(classification_report(y_test, best_predictions, target_names=class_names))

In [None]:
# heatmap of actual (rows) against predicted (columns) genres for the best model
fig, ax = plt.subplots(figsize=(8, 6))
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=label_encoder.classes_,
    yticklabels=label_encoder.classes_,
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.setp(ax.get_xticklabels(), rotation=45)
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

#### GridSearchCV for BOW LR

In [None]:
# logistic regression scored best above, so it is the model that gets tuned
from sklearn.metrics import make_scorer

# regularisation strengths tried with every solver
c_values = [0.1, 1, 10]

# only valid penalty/solver pairs are listed: lbfgs has no L1 support, saga handles both
param_grid = [
    {'penalty': ['l2'], 'solver': ['lbfgs'], 'C': c_values},
    {'penalty': ['l1', 'l2'], 'solver': ['saga'], 'C': c_values},
]

# macro-averaged F1 so every genre weighs the same in the search
f1_scorer = make_scorer(f1_score, average="macro")

In [None]:
# base estimator whose penalty, solver and C are filled in by the grid
tuning_model = LogisticRegression(random_state=42, max_iter=500)

# 5-fold search over param_grid scored by macro F1, using all processors
grid_search = GridSearchCV(
    estimator=tuning_model,
    param_grid=param_grid,
    scoring=f1_scorer,
    cv=5,
    n_jobs=-1,
)

grid_search.fit(X_train, y_train)