<a href="https://colab.research.google.com/github/rajkumar1325/CODSOFT/blob/main/MovieGenreClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [88]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer #converting text documents into numerical feature vectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Loading the dataset

In [89]:
# Fields in the raw files are separated by ":::" padded with spaces. Letting the
# separator regex swallow that padding keeps stray leading/trailing blanks out of
# the parsed titles, genres and plots (a multi-character separator is treated as
# a regular expression and requires the python engine).
FIELD_SEP = r'\s*:::\s*'

# Labeled training set: id, title, genre, plot.
train_data = pd.read_csv('/content/train_data.txt', sep=FIELD_SEP, names=['id', 'title', 'genre', 'plot'], engine='python')
# Unlabeled test set: id, title, plot.
test_data = pd.read_csv('/content/test_data.txt', sep=FIELD_SEP, names=['id', 'title', 'plot'], engine='python')
# Test set with its genre labels, used for evaluation.
test_solution = pd.read_csv('/content/test_data_solution.txt', sep=FIELD_SEP, names=['id', 'title', 'genre', 'plot'], engine='python')


# Explore the datasets

In [90]:
# Preview the first rows of the training frame to check that the id/title/genre/plot columns parsed as expected.
train_data.head()

Unnamed: 0,id,title,genre,plot
0,1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [91]:
# Preview the test frame; it was loaded with id/title/plot only, so there is no genre column here.
test_data.head()

Unnamed: 0,id,title,plot
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...


In [92]:
# Preview the test-set solution frame, which carries the genre labels used for evaluation.
test_solution.head()

Unnamed: 0,id,title,genre,plot
0,1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [93]:
# Number of training rows per genre (value_counts lists the most frequent genre first).
train_data['genre'].value_counts()


Unnamed: 0_level_0,count
genre,Unnamed: 1_level_1
drama,13613
documentary,13096
comedy,7447
short,5073
horror,2204
thriller,1591
action,1315
western,1032
reality-tv,884
family,784


In [94]:
# Total number of rows in the training set.
# len() counts every row directly; DataFrame.value_counts() groups on all columns
# and skips rows containing a missing value, so summing it can undercount.
len(train_data)

54214

# S2:: Preprocess the data

In [95]:
# Training features (plot text) and labels (genre).
# Fill missing plots BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna("") has nothing left to replace.
X_train = train_data['plot'].fillna("").astype(str)  # Ensure all plots are strings
y_train = train_data['genre']  # Genres


In [96]:
# Evaluation features come from the test set; the matching labels come from the
# solution file.
# Fill missing plots BEFORE casting: astype(str) turns NaN into the literal
# string "nan", after which fillna("") has nothing left to replace.
X_test = test_data['plot'].fillna("").astype(str)  # Ensure all plots are strings
# NOTE(review): labels are paired with X_test purely by row position — assumes
# test_data.txt and test_data_solution.txt list the movies in the same order; confirm.
y_test = test_solution['genre']


# S3:: Converting text into numeric features
### Using **TF-IDF** (Term Frequency-Inverse Document Frequency)

In [97]:
# Initialize the TF-IDF vectorizer (not fitted yet; fitting happens in the next cell).
vectorizer = TfidfVectorizer(stop_words="english")
# stop_words="english" drops scikit-learn's built-in list of common English words
# so they do not become features.


In [98]:
# Learn the vocabulary and IDF weights from the training plots only, then apply
# that same fitted vectorizer to the test plots so both matrices share identical
# feature columns and nothing is learned from the test set.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


# S4:: Training using Logistic Regression

In [99]:
# Initialize the classifier.
# max_iter=1000 sets the solver's iteration cap (presumably raised so the solver
# converges on the TF-IDF features — confirm); random_state=42 fixes the seed.
model = LogisticRegression(max_iter=1000, random_state=42)


In [100]:
# Train the model: TF-IDF matrix of the training plots as features, genre as the target.
model.fit(X_train_tfidf, y_train)


# Prediction of the model

In [101]:
# Predict one genre label for every row of the TF-IDF test matrix.
y_pred = model.predict(X_test_tfidf)

# Checking Accuracy of the model

In [102]:
# Fraction of test rows whose predicted genre equals the label from the solution file.
accuracy_score(y_test, y_pred)


0.5882103321033211

In [103]:
# Print the complete per-genre training counts. The classes are heavily
# imbalanced, which is useful context when reading the per-class report below.
print(train_data['genre'].value_counts())


genre
drama           13613
documentary     13096
comedy           7447
short            5073
horror           2204
thriller         1591
action           1315
western          1032
reality-tv        884
family            784
adventure         775
music             731
romance           672
sci-fi            647
adult             590
crime             505
animation         498
sport             432
talk-show         391
fantasy           323
mystery           319
musical           277
biography         265
history           243
game-show         194
news              181
war               132
Name: count, dtype: int64


# Classification Report

In [105]:
# Per-genre precision / recall / F1 on the test set.
# Some genres are never predicted, which makes their precision undefined.
# zero_division=0 reports those as 0.0 explicitly, the same value the default
# produces, without emitting an undefined-metric warning for each one.
print(classification_report(y_test, y_pred, zero_division=0))


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.53      0.26      0.35      1314
       adult        0.62      0.20      0.30       590
   adventure        0.71      0.14      0.24       775
   animation        0.60      0.02      0.05       498
   biography        0.00      0.00      0.00       264
      comedy        0.55      0.59      0.57      7446
       crime        0.43      0.02      0.04       505
 documentary        0.66      0.87      0.75     13096
       drama        0.54      0.80      0.64     13612
      family        0.60      0.08      0.14       783
     fantasy        0.78      0.02      0.04       322
   game-show        0.93      0.48      0.63       193
     history        0.00      0.00      0.00       243
      horror        0.67      0.57      0.61      2204
       music        0.70      0.39      0.50       731
     musical        0.67      0.01      0.01       276
     mystery        1.00      0.00      0.01       318
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Custom Search

In [119]:
def searchGenre(movie_title):
    """Find a movie by (partial) title and print the genre predicted from its plot.

    The training set is searched first; the test set is consulted only when the
    training set has no match. Matching is a case-insensitive, literal substring
    test against the ``title`` column. When several titles match, only the first
    match is reported.

    Args:
        movie_title: Text to look for inside the movie titles.

    Returns:
        None. The title and predicted genre (or a "not found" message) are printed.
    """
    # An empty query is a substring of every title and would silently report
    # the first row of the training set, so treat it as "nothing to find".
    if not movie_title or not movie_title.strip():
        print("Movie not found in datasets.")
        return

    match = None
    for dataset in (train_data, test_data):
        # regex=False: titles contain regex metacharacters such as "(1997)";
        # interpreted as a pattern, an exact title would fail to match itself.
        hits = dataset[dataset['title'].str.contains(movie_title, case=False, na=False, regex=False)]
        if not hits.empty:
            match = hits.iloc[0]
            break

    if match is None:
        # The movie is in neither dataset.
        print("Movie not found in datasets.")
        return

    # Vectorize only the plot that is actually reported, and guard against a
    # missing plot, which the vectorizer cannot transform.
    plot_text = "" if pd.isna(match['plot']) else str(match['plot'])
    plot_tfidf = vectorizer.transform([plot_text])

    # Predict the genre with the trained model.
    predicted_genre = model.predict(plot_tfidf)[0]

    # Print the result.
    print(f"Title: {match['title']}")
    print(f"Predicted Genre: {predicted_genre}")


In [125]:
# Example: a lower-case, partial title; searchGenre matches case-insensitively on a substring.
movie = "cupid"
searchGenre(movie)


Title:  Cupid (1997) 
Predicted Genre:  thriller 


In [123]:
# A title shown in the test-set preview; presumably absent from the training
# set, so this exercises the fallback search in test_data — confirm.
# Note the trailing space inside the query string: it is part of the substring being matched.
movie = "La guerra de papá "
searchGenre(movie)

Title:  La guerra de papá (1977) 
Predicted Genre:  drama 


In [124]:
# Another title taken from the test-set preview.
movie = "Off the Beaten Track"
searchGenre(movie)

Title:  Off the Beaten Track (2010) 
Predicted Genre:  documentary 
