In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import seaborn as sns

In [6]:
# Load the training set. Fields are delimited by ':::'; the regex separator
# also swallows optional whitespace on either side of the delimiter so that
# titles, genre labels and descriptions are not stored with stray padding
# (padded genre labels would otherwise become distinct class names).
# A multi-character separator is interpreted as a regular expression and
# requires the Python parsing engine.
df = pd.read_csv(
    "train_data.txt",
    header=None,
    sep=r"\s*:::\s*",
    names=["Title", "Genre", "Description"],
    engine="python",
)
df.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [8]:
def remove_punc(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Args:
        text: The string to clean.

    Returns:
        A new string made of the characters of ``text`` that are not in
        ``string.punctuation``, in their original order.
    """
    import string as st
    # A deletion table lets str.translate strip all punctuation in one pass,
    # instead of testing each character against the punctuation string.
    return text.translate(str.maketrans('', '', st.punctuation))

# Normalise the descriptions: strip punctuation first, then lowercase.
df['Description'] = df['Description'].apply(remove_punc).str.lower()
df.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,listening in to a conversation between his do...
2,Cupid (1997),thriller,a brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,as the bus empties the students for their fie...
4,The Secret Sin (1915),drama,to help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,the films title refers not only to the unreco...


In [10]:
def toknize(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``.

    Args:
        text: A string, or a list of strings. A list is first joined into
            one string with single spaces between its items.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    from nltk.tokenize import word_tokenize
    # Accept already-tokenised input by flattening it back into a string.
    source = ' '.join(text) if isinstance(text, list) else text
    return word_tokenize(source)
# Replace each description string with its list of word tokens.
df['Description'] = df['Description'].apply(toknize)
df.head()

Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,"[listening, in, to, a, conversation, between, ..."
2,Cupid (1997),thriller,"[a, brother, and, sister, with, a, past, inces..."
3,"Young, Wild and Wonderful (1980)",adult,"[as, the, bus, empties, the, students, for, th..."
4,The Secret Sin (1915),drama,"[to, help, their, unemployed, father, make, en..."
5,The Unrecovered (2007),drama,"[the, films, title, refers, not, only, to, the..."


In [12]:
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK corpora requested for lemmatization.
for resource in ("wordnet", "omw-1.4"):
    nltk.download(resource)
def lemmatize(text):
    """Lemmatize every token in ``text`` and join the results with spaces.

    Args:
        text: An iterable of word tokens.

    Returns:
        A single string of the lemmatized tokens separated by one space.
    """
    wnl = WordNetLemmatizer()
    return " ".join(map(wnl.lemmatize, text))

# Collapse each token list back into one lemmatized string.
df['Description'] = df['Description'].apply(lemmatize)
df.head()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,listening in to a conversation between his doc...
2,Cupid (1997),thriller,a brother and sister with a past incestuous re...
3,"Young, Wild and Wonderful (1980)",adult,a the bus empty the student for their field tr...
4,The Secret Sin (1915),drama,to help their unemployed father make end meet ...
5,The Unrecovered (2007),drama,the film title refers not only to the unrecove...


In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, keeping at most 5000 terms and dropping
# English stop words.
tf_idf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
)
tfidf_matrix = tf_idf.fit_transform(df['Description'])

# Densify into a frame whose columns are labelled with the vocabulary terms.
Description_tf_idf = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=tf_idf.get_feature_names_out(),
)

In [27]:
# Inspect the dense TF-IDF frame (one column per vocabulary term).
Description_tf_idf

Unnamed: 0,10,10 year,100,1000,10000,11,12,12 year,13,14,...,youngster,youre,youth,youtube,youve,zealand,zero,zombie,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.176114,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.207057,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54209,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
54210,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
54211,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
54212,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [29]:
from textblob import TextBlob

descriptions = df['Description']
tokens = descriptions.str.split()

# Number of non-space characters in each description.
df['Description len'] = descriptions.str.len() - descriptions.str.count(' ')
# Number of whitespace-separated words.
df['Word_Count'] = tokens.str.len()
# Mean character length of those words.
df['Average_Word_Length'] = tokens.apply(
    lambda words: np.mean([len(word) for word in words])
)
# TextBlob polarity score of the description.
df['Sentiment'] = descriptions.apply(lambda text: TextBlob(text).sentiment.polarity)
df.head()

Unnamed: 0,Title,Genre,Description,Description len,Word_Count,Average_Word_Length,Sentiment
1,Oscar et la dame rose (2009),drama,listening in to a conversation between his doc...,423,92,4.597826,0.192975
2,Cupid (1997),thriller,a brother and sister with a past incestuous re...,148,32,4.625,-0.125
3,"Young, Wild and Wonderful (1980)",adult,a the bus empty the student for their field tr...,499,113,4.415929,0.325
4,The Secret Sin (1915),drama,to help their unemployed father make end meet ...,846,191,4.429319,0.029545
5,The Unrecovered (2007),drama,the film title refers not only to the unrecove...,501,106,4.726415,0.145536


In [31]:
from sklearn.preprocessing import MinMaxScaler
# Rescale the character-count feature to the [0, 1] range. The scaler expects
# a 2-D input, hence the single-column reshape.
scaler = MinMaxScaler()
length_column = df['Description len'].to_numpy().reshape(-1, 1)
df['Description_len_sclr'] = scaler.fit_transform(length_column)
df.head()

Unnamed: 0,Title,Genre,Description,Description len,Word_Count,Average_Word_Length,Sentiment,Description_len_sclr
1,Oscar et la dame rose (2009),drama,listening in to a conversation between his doc...,423,92,4.597826,0.192975,0.047852
2,Cupid (1997),thriller,a brother and sister with a past incestuous re...,148,32,4.625,-0.125,0.014197
3,"Young, Wild and Wonderful (1980)",adult,a the bus empty the student for their field tr...,499,113,4.415929,0.325,0.057153
4,The Secret Sin (1915),drama,to help their unemployed father make end meet ...,846,191,4.429319,0.029545,0.099621
5,The Unrecovered (2007),drama,the film title refers not only to the unrecove...,501,106,4.726415,0.145536,0.057398


In [33]:
# The genre is the label; the text columns and the unscaled length are left
# out of the hand-crafted feature frame.
target = df['Genre']
excluded_columns = ['Genre', 'Title', 'Description', 'Description len']
features = df.drop(columns=excluded_columns)
features

Unnamed: 0,Word_Count,Average_Word_Length,Sentiment,Description_len_sclr
1,92,4.597826,0.192975,0.047852
2,32,4.625000,-0.125000,0.014197
3,113,4.415929,0.325000,0.057153
4,191,4.429319,0.029545,0.099621
5,106,4.726415,0.145536,0.057398
...,...,...,...,...
54210,85,4.576471,0.159091,0.043691
54211,130,4.746154,0.135000,0.071595
54212,48,4.083333,0.125000,0.020071
54213,120,4.166667,-0.062727,0.057276


In [37]:
# Give both frames the same 0-based positional index so that the column-wise
# concatenation pairs rows by position.
features = features.reset_index(drop=True)
Description_tf_idf = Description_tf_idf.reset_index(drop=True)

# Place the hand-crafted features and the TF-IDF columns side by side.
feature = pd.concat([features, Description_tf_idf], axis=1)

In [41]:
# Inspect the combined matrix: hand-crafted features followed by the TF-IDF columns.
feature

Unnamed: 0,Word_Count,Average_Word_Length,Sentiment,Description_len_sclr,10,10 year,100,1000,10000,11,...,youngster,youre,youth,youtube,youve,zealand,zero,zombie,zone,zoo
0,92,4.597826,0.192975,0.047852,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
1,32,4.625000,-0.125000,0.014197,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
2,113,4.415929,0.325000,0.057153,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
3,191,4.429319,0.029545,0.099621,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
4,106,4.726415,0.145536,0.057398,0.0,0.0,0.0,0.0,0.0,0.176114,...,0.0,0.0,0.0,0.0,0.0,0.0,0.207057,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54209,85,4.576471,0.159091,0.043691,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
54210,130,4.746154,0.135000,0.071595,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
54211,48,4.083333,0.125000,0.020071,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0
54212,120,4.166667,-0.062727,0.057276,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0


In [49]:
from sklearn.model_selection import train_test_split
# Column labels are cast to plain strings before the frame is handed to
# scikit-learn.
feature.columns = feature.columns.astype(str)

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# split, and therefore every downstream metric, reproducible across
# Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.2,
    random_state=42,
)

In [52]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from xgboost import XGBClassifier

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# Majority-vote ensemble of a random forest, a logistic regression and a
# gradient-boosted tree model.
ensemble_model = VotingClassifier(
    estimators=[
        # Seeded so repeated runs build the same forest.
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        # The feature matrix mixes raw word counts with TF-IDF weights, and
        # logistic regression stopped at its iteration limit on that unscaled
        # input. Scaling inside a pipeline puts every feature on a [0, 1]
        # range for this estimator only; the tree models still see the
        # original values.
        ('lr', make_pipeline(MinMaxScaler(), LogisticRegression(max_iter=500))),
        ('xgb', XGBClassifier(n_estimators=100, max_depth=5, random_state=42))
    ],
    voting='hard'
)
ensemble_model.fit(x_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
