In [8]:
import pandas as pd

In [9]:
train=pd.read_csv('../../dataset/movie-genre-classification/train_data.txt', sep=':::',engine='python', names=["ID", 'Title', 'Genre', 'Description'], header=None, index_col=0)
train.index = train.index.astype(int)
train.columns=train.columns.astype(str)
train

Unnamed: 0_level_0,Title,Genre,Description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [10]:
test=pd.read_csv('../../dataset/movie-genre-classification/test_data.txt', sep=':::', engine='python', header=None, index_col=0, names=['ID', 'Title', 'Description'])
test

Unnamed: 0_level_0,Title,Description
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...
54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
#performed tf-idf here 
tf=TfidfVectorizer(min_df=15, max_df=0.8,stop_words='english', ngram_range=(1, 1),sublinear_tf=True,norm='l2')
X=train.drop('Genre', axis=1)
y=train['Genre']
# combined the features inorder for the model to use them both
X['combined']= X['Title'] + ' ' + X['Description']
X=X['combined']

from sklearn.model_selection import train_test_split
train_x, val_x, train_y, val_y=train_test_split(X, y, test_size=0.2, random_state=0)

x_train_tf=tf.fit_transform(train_x)
x_val_tf=tf.transform(val_x)



from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
#model=MultinomialNB(alpha=0.5) # tried Naive bayes but i didnt find it to be better
model=LogisticRegression(max_iter=300, penalty='l2', C=1)
model.fit(x_train_tf,train_y)

In [23]:

val_predictions = model.predict(x_val_tf)

val_predictions_df = pd.DataFrame(val_x.copy() ) # i wanted to test it on the validation data before going ahead and predicting with test_data

# added this dataframe because it would be easier for us to see the predictions with the features
val_predictions_df['predicted'] = val_predictions

print(val_predictions_df[['combined', 'predicted']].head())

# i am saving the DataFrame to a CSV file so i could check the output 
val_predictions_df.to_csv('val_predictions.csv', index=False)


                                                combined      predicted
ID                                                                     
24144   Blacklist: Recovering the Life of Canada Lee ...   documentary 
28270   Dominó: agarrado por la crisis (2014)   Luis,...        comedy 
10686   One Mission (2010)   How different was the Wo...   documentary 
31405   OUTATIME: Saving the DeLorean Time Machine (2...   documentary 
32895   The Return of Eve (1916)   Believing that ove...         drama 


In [24]:
#in here i am checking the accuracy of the validation data
from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(val_y, val_predictions))

Accuracy: 0.6012173752651481


In [25]:
# i used cross validation because i couldnt trust the model. first it was overfitting then it was underfitting
from sklearn.model_selection import cross_val_score
score=cross_val_score(model, x_train_tf, train_y, cv=5)
print(f'cross val scores ', score)
print(f'average cross val score', score.mean())

cross val scores  [0.59158501 0.58946276 0.58300669 0.58669587 0.58496657]
average cross val score 0.5871433805622687


In [26]:
# to see how the model did on training data
model.score(x_train_tf, train_y)

0.7135874201655484

In [27]:
# to see how the model did on validation data
model.score(x_val_tf, val_y)

0.6012173752651481

In [28]:
# trying with the real test data here after transforming the test data 
test_x=test['Title'] + ' ' + test['Description']
transformed_test=tf.transform(test_x)
predictions=model.predict(transformed_test)
pred_df=pd.DataFrame(test_x.copy())
pred_df['predictions']=predictions
pred_df


Unnamed: 0_level_0,0,predictions
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Edgar's Lunch (1998) L.R. Brane loves his l...,comedy
2,"La guerra de papá (1977) Spain, March 1964:...",drama
3,Off the Beaten Track (2010) One year in the...,documentary
4,"Meu Amigo Hindu (2015) His father has died,...",drama
5,Er nu zhai (1955) Before he was known inter...,drama
...,...,...
54196,"""Tales of Light & Dark"" (2013) Covering mul...",drama
54197,Der letzte Mohikaner (1965) As Alice and Co...,drama
54198,Oliver Twink (2007) A movie 169 years in th...,comedy
54199,"Slipstream (1973) Popular, but mysterious r...",drama


In [18]:
# loaded the test_data_solution so i could compare 
solution_data=pd.read_csv('../../dataset/movie-genre-classification/test_data_solution.txt', sep=':::', engine='python', header=None, index_col=0, names=['Title', 'Genre', 'Description'])
solution_data

Unnamed: 0,Title,Genre,Description
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),drama,Before he was known internationally as a mart...
...,...,...,...
54196,"""Tales of Light & Dark"" (2013)",horror,"Covering multiple genres, Tales of Light & Da..."
54197,Der letzte Mohikaner (1965),western,As Alice and Cora Munro attempt to find their...
54198,Oliver Twink (2007),adult,A movie 169 years in the making. Oliver Twist...
54199,Slipstream (1973),drama,"Popular, but mysterious rock D.J Mike Mallard..."


In [29]:
# the accuracy score on the real test data
true_label=solution_data['Genre']
from sklearn.metrics import accuracy_score
acc=accuracy_score(predictions, true_label)
acc

0.5932103321033211