<a href="https://colab.research.google.com/github/saichethanr/CodSoft/blob/main/Movie_genre_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Genre Classification

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the Dataset

In [None]:
# Load the training data. The file has no header row (column names are supplied
# here) and its fields are separated by " ::: "; a separator longer than one
# character needs pandas' Python parsing engine.
train_df = pd.read_csv(
    'train_data.txt',
    engine='python',
    sep=" ::: ",
    skipinitialspace=True,
    names=['id', 'title', 'genre', 'description'],
)

In [None]:
# Distinct genre labels present in the raw training data.
train_df.genre.unique()

array(['drama', 'thriller', 'adult', 'documentary', 'comedy', 'crime',
       'reality-tv', 'horror', 'sport', 'animation', 'action', 'fantasy',
       'short', 'sci-fi', 'music', 'adventure', 'talk-show', 'western',
       'family', 'mystery', 'history', 'news', 'biography', 'romance',
       'game-show', 'musical', 'war'], dtype=object)

In [None]:
import plotly.express as px

# Count rows per genre and draw the counts as a bar chart.
genre = train_df['genre'].value_counts()
fig = px.bar(
    genre,
    x=genre.index,
    y=genre,
    title='Genre Distribution',
).update_layout(xaxis_title='Genre', yaxis_title='Count')
fig.show()

## Cleaning the texts

In [None]:
import re
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the cleaning step needs before any of them is used.
# 'punkt_tab' is requested in addition to 'punkt' because recent NLTK releases
# make word_tokenize load its tokenizer data from that package and raise a
# LookupError when only 'punkt' is installed.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# English stopwords as a set so membership checks in clean_text are O(1).
stopword = set(stopwords.words('english'))

def clean_text(text):
    """Reduce a description to a space-separated string of lowercase lemmas.

    The text is tokenised with ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lowercase form is in the module-level ``stopword``
    set, are discarded; every surviving token is lowercased and lemmatised
    with ``WordNetLemmatizer``.

    Parameters:
        text: the string to clean.

    Returns:
        The kept lemmas joined by single spaces (an empty string when no
        token survives the filters).
    """
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()

    kept = []
    for token in tokens:
        # Punctuation, numbers and mixed tokens such as "it's" are dropped here.
        if not token.isalpha():
            continue
        lowered = token.lower()
        if lowered in stopword:
            continue
        kept.append(lemmatizer.lemmatize(lowered))

    return ' '.join(kept)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Clean every description element-wise and keep the result in its own column.
train_df['description_cleaned'] = train_df['description'].map(clean_text)

In [None]:
def make_genre_groups(df):
    """Collapse fine-grained genre labels into a handful of broader groups.

    The ``genre`` column of ``df`` is overwritten with the grouped labels and
    the same ``df`` object is returned, so pass a copy if the original labels
    must be preserved. Labels that belong to no group (for example "drama")
    are left unchanged.

    The column is reassigned explicitly rather than updated through
    ``df.genre.mask(..., inplace=True)``: that chained in-place form acts on a
    temporary Series under pandas Copy-on-Write and leaves ``df`` untouched.

    Parameters:
        df: DataFrame with a ``genre`` column of string labels.

    Returns:
        ``df`` itself, with ``genre`` replaced by the grouped labels.
    """
    # Target group -> the original labels folded into it.
    genre_groups = {
        "documentary": ("biography", "history"),
        "thriller": ("mystery", "horror"),
        "action": ("adventure", "sci-fi", "fantasy"),
        "comedy": ("romance",),
        "other": (
            "short", "western", "reality-tv", "family", "music", "adult",
            "crime", "animation", "sport", "talk-show", "musical",
            "game-show", "news", "war",
        ),
    }
    # Invert to a flat lookup: original label -> target group.
    genre_to_group = {
        label: group
        for group, labels in genre_groups.items()
        for label in labels
    }

    df['genre'] = df['genre'].replace(genre_to_group)
    return df

In [None]:
# Group the genres on a copy, then rebind train_df to the grouped frame.
train_df = make_genre_groups(df=train_df.copy())

In [None]:
# Same bar chart as before, now over the grouped labels and coloured per genre.
genre = train_df['genre'].value_counts()
fig = px.bar(
    genre,
    x=genre.index,
    y=genre,
    title='Genre Distribution',
    color=genre.index,
).update_layout(xaxis_title='Genre', yaxis_title='Count')
fig.show()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over uni-, bi- and tri-grams of the cleaned descriptions, capped at
# 5000 features; terms in fewer than 2 documents or in more than 95% of them
# are ignored.
# NOTE(review): the vectorizer is fit on every row before the train/test split
# further down, so held-out rows contribute to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    max_features=5000,
)
tfidf_matrix = tfidf_vectorizer.fit_transform(train_df['description_cleaned'])
tfidf_vectorizer.get_feature_names_out()[:10]

array(['aaron', 'abandon', 'abandoned', 'abducted', 'ability', 'able',
       'aboard', 'abortion', 'abroad', 'absence'], dtype=object)

In [None]:
# Convert the TF-IDF matrix to a dense DataFrame with one column per feature.
train_tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the grouped genre strings as integer class ids; the id -> name mapping
# stays available afterwards as label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['genre'])
y = label_encoder.transform(train_df['genre'])

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    train_tfidf_df,
    y,
    random_state=42,
    test_size=0.3,
)

## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the dense TF-IDF training features.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

## Predicting the Test set results

In [None]:
# Naive Bayes predictions (encoded class ids) for the held-out rows.
y_pred1 = classifier.predict(X=X_test)


## Making the Confusion Matrix

In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred1)
print(cm)

# Printed explicitly: a bare expression is only echoed when it is the cell's
# last statement, so the accuracy would otherwise be computed and discarded.
print(accuracy_score(y_test, y_pred1))

# classification_report takes (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports support per predicted class
# instead of per true class.
print(classification_report(y_test, y_pred1))

[[ 401   15   36   36   28  103]
 [ 377  529   84  167  111  326]
 [ 455   80 1747  145  183  155]
 [ 704  308  205  692  114  674]
 [ 708  246  487  138  428  365]
 [ 216   34   24   58   28  436]]
              precision    recall  f1-score   support

           0       0.65      0.14      0.23      2861
           1       0.33      0.44      0.38      1212
           2       0.63      0.68      0.65      2583
           3       0.26      0.56      0.35      1236
           4       0.18      0.48      0.26       892
           5       0.55      0.21      0.31      2059

    accuracy                           0.39     10843
   macro avg       0.43      0.42      0.36     10843
weighted avg       0.51      0.39      0.38     10843



## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression

# The default iteration cap (max_iter=100) stops lbfgs before it converges on
# these features and triggers a ConvergenceWarning, so the cap is raised.
lr = LogisticRegression(random_state=0, max_iter=1000)
lr.fit(X_train, y_train)


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression



## Making the Confusion Matrix for Logistic Regression

In [None]:
# Logistic regression predictions (encoded class ids) for the held-out rows.
y_pred2 = lr.predict(X=X_test)

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred2)
print(cm)
print(accuracy_score(y_test, y_pred2))

# classification_report takes (y_true, y_pred). Passing them the other way
# round swaps precision with recall and reports support per predicted class
# instead of per true class.
print(classification_report(y_test, y_pred2))

[[ 200   33   62  132  138   54]
 [  13  816   49  413  278   25]
 [  11   34 2184  222  297   17]
 [  19  259  196 1911  259   53]
 [  60  212  410  397 1237   56]
 [  29   30   37  186   97  417]]
              precision    recall  f1-score   support

           0       0.32      0.60      0.42       332
           1       0.51      0.59      0.55      1384
           2       0.79      0.74      0.77      2938
           3       0.71      0.59      0.64      3261
           4       0.52      0.54      0.53      2306
           5       0.52      0.67      0.59       622

    accuracy                           0.62     10843
   macro avg       0.56      0.62      0.58     10843
weighted avg       0.64      0.62      0.63     10843

