In [1]:
# Text classification using Multinomial Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [20]:
# Read the movie metadata CSV into a DataFrame and preview its first four rows
import pandas as pd

mv = pd.read_csv('movie1.csv')
mv.head(n=4)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,10/30/1995,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,12/15/1995,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,12/22/1995,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,12/22/1995,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34


In [38]:
# Features are the movie titles; the target label is the raw spoken_languages string.
# Hold out 20% of the rows for evaluation, with a fixed seed so the split is repeatable.
X, y = mv['title'], mv['spoken_languages']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [39]:
# Turn the titles into bag-of-words count matrices.
# The vocabulary is learned from the training titles only; the test titles are
# then encoded against that same vocabulary.
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(raw_documents=X_train)
X_test_vect = vectorizer.transform(raw_documents=X_test)

In [40]:
# Count the missing values in each column
print(mv.isna().sum())

adult                       0
belongs_to_collection     966
budget                      0
genres                      0
homepage                 1044
id                          0
imdb_id                     0
original_language           0
original_title              0
overview                    0
popularity                  0
poster_path                 5
production_companies        0
production_countries        0
release_date                2
revenue                     0
runtime                     5
spoken_languages            0
status                      2
tagline                   294
title                       0
video                       0
vote_average                0
vote_count                  0
dtype: int64


In [43]:
# Impute missing values in the 'spoken_languages' column with its most frequent value.
#
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: the in-place form is chained
# assignment, which pandas deprecates and which no longer updates `mv` once
# Copy-on-Write is enabled.
#
# NOTE(review): X / y and the train/test split are built from `mv` in an earlier
# cell, so this imputation cannot affect them unless it runs before that split.
# The null count printed above also reports 0 missing values for this column,
# so this step looks like a no-op on the current data — TODO confirm the
# intended cell order.
spoken_languages_mode = mv['spoken_languages'].mode()
if not spoken_languages_mode.empty:  # mode() is empty when every value is missing
    mv['spoken_languages'] = mv['spoken_languages'].fillna(spoken_languages_mode[0])

# ***What is Multinomial Naive Bayes?***

Multinomial Naive Bayes is a variant of the Naive Bayes algorithm, particularly suitable for classification tasks involving discrete features like word counts in text classification.

# ***Here are the key components:***

# *Naive Bayes:*
- **Bayes' Theorem:** A probabilistic formula for the probability of a hypothesis given the evidence.
- **"Naive" Assumption:** The features are assumed to be independent of one another given the class (in text classification, each word is assumed to contribute independently to the class).

# *Multinomial Naive Bayes:*
Suited for Text Classification: Commonly used for text classification problems where features are typically word counts or frequencies.


*Works with Discrete Features:* It handles features that represent counts or frequencies of events (like word occurrences).

Probability Computation: Computes probabilities based on the frequency of words in the document and their occurrence in each class.



# How it works in text classification:

# *Text Vectorization:*
Convert text data into numerical form (such as word counts or TF-IDF values).

# *Calculating Probabilities:*
Multinomial Naive Bayes calculates the probabilities of each word given the class (P(word|class)) and the prior probabilities of each class (P(class)).

# *Predictions:*
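For a new text sample, the algorithm calculates the probability of it belonging to each class based on the occurrence of words in that sample: $P(\text{class} \mid \text{sample}) \propto P(\text{class}) \prod_{w} P(w \mid \text{class})^{n_w}$, where $n_w$ is the number of times word $w$ occurs in the sample.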

# *Classification:*
The class with the highest probability is assigned to the new text sample.


# Advantages:  

Efficient and easy to implement.

Works well with large feature spaces, like in text classification, where the number of unique words can be large.

Performs well even with relatively small datasets.

# Limitations:

Naive Bayes assumes independence between features, which might not be true in some real-world datasets.

It relies on the frequency of word occurrences, so it might not capture the semantics of words or the relationships between them.

In essence, Multinomial Naive Bayes is a simple yet effective probabilistic classifier commonly used in text classification tasks due to its efficiency and decent performance, especially with large feature spaces like text data.


In [42]:
# Fit a Multinomial Naive Bayes model (default hyperparameters) on the
# bag-of-words counts of the training titles
clf = MultinomialNB()
clf.fit(X=X_train_vect, y=y_train)

In [44]:
# Predict a spoken_languages label for every held-out title
predictions = clf.predict(X=X_test_vect)

In [45]:
# Fraction of held-out titles whose predicted label exactly matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.62


In [46]:
# Show the same accuracy as an unrounded percentage
print(100 * accuracy, '%')

62.16216216216216 %
