# Problem statement

- <h3>Predicting the Rating and Gross Collection of movies from IMDB dataset.</h3>

# Loading the dataset

In [None]:
# Importing the required libraries and data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings

# Show every column when a wide DataFrame is displayed (None disables the column limit).
pd.set_option('display.max_columns', None)

In [None]:
# Read the first CSV file into `data1`.
data1 = pd.read_csv('CSV1_normalized.csv')

In [None]:
# Read the second CSV file into `data2`.
data2 = pd.read_csv('CSV2_normalized.csv')

In [None]:
# Preview the first rows of data1.
data1.head()

- Note :- In below dataset ratings mentioned as popularity.

In [None]:
data2.head()

In [None]:
# Comparing the values of the 'Name' columns
if data1['Name'].equals(data2['Name']):
    print("The values in both column are the same")
else:
    print("The values in both column are different")

In [None]:
list(data1['Name'].unique())

In [None]:
# Removing leading and trailing spaces
data1['Name'] = data1['Name'].str.strip()

In [None]:
list(data2['Name'].unique())

In [None]:
# Removing leading and trailing spaces
data2['Name'] = data2['Name'].str.strip()

In [None]:
# Comparing the values of the 'Name' columns
if data1['Name'].equals(data2['Name']):
    print("The values in both column are the same")
else:
    print("The values in both column are different")

- After removing the leading and trailing whitespace, the `Name` columns of data1 and data2 contain the same values, so we can use `Name` as the key to merge both datasets.

# Merging the data1 and data2

In [None]:
# Inner join on 'Name': only rows whose name occurs in both frames are kept.
# NOTE(review): if 'Name' is not unique in either frame the join multiplies rows — confirm uniqueness.
df = pd.merge(data1, data2, on = "Name", how = "inner")

In [None]:
# Preview the merged frame.
df.head()

- Here we use an inner join to merge the datasets, so we keep only the movies that are present in both data1 and data2.

In [None]:
# Distinct values of the 'Unnamed: 0' column that came from data1 (suffix _x added by the merge).
list(df['Unnamed: 0_x'].unique())

In [None]:
# Distinct values of the 'Unnamed: 0' column that came from data2 (suffix _y added by the merge).
list(df['Unnamed: 0_y'].unique())

- Both unnamed columns only contain serial numbers, so we can drop them because they carry no information. Such columns are typically created when a table is saved together with its row index (for example by a spreadsheet export or by writing a DataFrame to CSV with its index).

In [None]:
# Print column names, non-null counts and dtypes of the merged frame.
df.info()

- We have a total of 1725 rows and 23 columns: 1 column has dtype float64, 3 columns have dtype int64 and 19 columns have dtype object. We need to change the dtype of 'Duration of the movie', 'Metascore', 'Votes' and 'Gross Collection', because these columns hold integer and float values stored as text.

# Aim :-  Prediction of Gross Collection

In [None]:
df.shape

In [None]:
# Droping the unwanted columns and the columns having maximum None values
df.drop(['Unnamed: 0_x'], axis = 1, inplace = True)
df.drop(['Unnamed: 0_y'], axis = 1, inplace = True)
df.drop(['Director2'], axis = 1 , inplace = True)
df.drop(['Director3'], axis = 1 , inplace = True)
df.drop(['Director4'], axis = 1 , inplace = True)
df.drop(['Director5'], axis = 1 , inplace = True)
df.drop(['Director6'], axis = 1 , inplace = True)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

- We have missing values in 'Metascore','Gross Collection','Genre3','Star2','Star3' and 'Star4' columns.

In [None]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
df.duplicated().sum()

- There is no duplicate values in our dataset.

In [None]:
list(df['Name'].unique())

In [None]:
df = df.rename(columns={"Duration of the movie" : "Duration in min","Popularity" : "Ratings"})

In [None]:
df.head(1)

## Lets see the unique values in other columns

In [None]:
# Distinct release years.
list(df['Year'].unique())

In [None]:
# Distinct duration strings (still text at this point).
list(df['Duration in min'].unique())

In [None]:
# Distinct Metascore strings (still text at this point).
list(df['Metascore'].unique())

In [None]:
# Distinct first-listed directors.
list(df['Director1'].unique())

In [None]:
# Removing leading and trailing spaces
df['Director1'] = df['Director1'].str.strip()

In [None]:
# Distinct vote-count strings (still text at this point).
list(df['Votes'].unique())

In [None]:
# Distinct rating values.
list(df['Ratings'].unique())

In [None]:
# Distinct certificate labels; used to spot values that do not belong in this column.
list(df['Certificate'].unique())

- In some rows a duration (for example '135 min') is stored in the Certificate column; we replace those values with 'Not Rated'.
- '+' is a special character, but certificates such as 13+ or 16+ have a meaning in different countries. As we don't have the country details we cannot replace them; the same applies to M/PG.
- https://help.imdb.com/article/contribution/titles/certificates/GU757M8ZJ9ZPXB39?ref_=helpart_nav_27#
- By using above link You can see the certificates in different country which awarded for movies in that country. 

In [None]:
df['Certificate'] = df['Certificate'].replace({'135 min ':'Not Rated', '128 min ':'Not Rated', '107 min ':'Not Rated', '130 min ':'Not Rated', '155 min ':'Not Rated', '85 min ':'Not Rated', '99 min ':'Not Rated', '97 min ':'Not Rated', '125 min ':'Not Rated' })


In [None]:
# Removing leading and trailing spaces 
df['Certificate'] = df['Certificate'].str.strip()

In [None]:
list(df['Genre1'].unique())

In [None]:
list(df['Genre2'].unique())

In [None]:
list(df['Genre3'].unique())

In [None]:
# Removing leading and trailing spaces 
df['Genre1'] = df['Genre1'].str.strip()
df['Genre2'] = df['Genre2'].str.strip()
df['Genre3'] = df['Genre3'].str.strip()

In [None]:
list(df['Star1'].unique())

In [None]:
# Remove special character "
df['Star1'] = df['Star1'].str.replace(r'"', " ", regex = True)

In [None]:
list(df['Star2'].unique())

In [None]:
# Remove special character "
df['Star2'] = df['Star2'].str.replace(r'"', " ", regex = True)

In [None]:
list(df['Star3'].unique())

In [None]:
# Remove special character "
df['Star3'] = df['Star3'].str.replace(r'"', " ", regex = True)

In [None]:
list(df['Star4'].unique())

In [None]:
# Remove special character "
df['Star4'] = df['Star4'].str.replace(r'"', " ", regex = True)

In [None]:
# Removing leading and trailing spaces 
df['Star1'] = df['Star1'].str.strip()
df['Star2'] = df['Star2'].str.strip()
df['Star3'] = df['Star3'].str.strip()
df['Star4'] = df['Star4'].str.strip()

In [None]:
# Remove some special characters from Gross Collection
df['Gross Collection'] = df['Gross Collection'].str.replace(r"$", " ", regex = True) 

In [None]:
df['Gross Collection'] = df['Gross Collection'].str.replace(r"M", " ", regex=True)

In [None]:
df['Gross Collection'] = df['Gross Collection'].apply(pd.to_numeric)

In [None]:
df['Votes'][0]

In [None]:
df['Votes'] = df['Votes'].str.replace(r",", "", regex = True) 

In [None]:
df['Votes'] = df['Votes'].apply(pd.to_numeric)

In [None]:
df['Metascore'][0]

In [None]:
df['Duration in min'] = df['Duration in min'].str.replace(r"min", "", regex = True)

In [None]:
df['Metascore'] = df['Metascore'].str.replace(r"Metascore", " ", regex = True)

In [None]:
# Removing leading and trailing spaces
df['Metascore'] = df['Metascore'].str.strip()

In [None]:
df['Metascore'] = df['Metascore'].apply(pd.to_numeric)
df['Duration in min'] = df['Duration in min'].apply(pd.to_numeric)

In [None]:
# Confirm the dtypes after the text-to-number conversions above.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

- We can see Metascore and Gross Collection columns has null values since its count is less than the total number of rows.


- We have a dataset of movies that were released from 1924 to 2023.


- In Votes the mean and the median differ, which means the data in this column is not symmetrically distributed. The mean is greater than the median, which implies that the data is skewed to the right.


- A low standard deviation indicates that the values tend to be close to the mean (also called the expected value) of the set, while a high standard deviation indicates that the values are spread out over a wider range. We can see in gross collection and votes column data points are quite spread out from the mean. While in other columns standard deviation indicates that data is close toward mean.


- While observing 75% and maximum there may be outliers present in our dataset. But in Metascore column we can see mean and median is nearly equal means in that column data is symmetrically distributed.

In [None]:
# Lets check for outliers using IQR
# calculate the interquartile range (IQR)
Q1 = df['Year'].quantile(0.25)
Q3 = df['Year'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df.loc[(df['Year'] < lower_bound) | (df['Year'] > upper_bound)]
num_outliers = outliers.shape[0]

# print the number of outliers
print(f"Number of outliers: {num_outliers}")


In [None]:
# calculate the interquartile range (IQR)
Q1 = df['Duration in min'].quantile(0.25)
Q3 = df['Duration in min'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df.loc[(df['Duration in min'] < lower_bound) | (df['Duration in min'] > upper_bound)]
num_outliers = outliers.shape[0]

# print the number of outliers
print(f"Number of outliers: {num_outliers}")


In [None]:
# calculate the interquartile range (IQR)
Q1 = df['Metascore'].quantile(0.25)
Q3 = df['Metascore'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df.loc[(df['Metascore'] < lower_bound) | (df['Metascore'] > upper_bound)]
num_outliers = outliers.shape[0]

# print the number of outliers
print(f"Number of outliers: {num_outliers}")

In [None]:
# calculate the interquartile range (IQR)
Q1 = df['Votes'].quantile(0.25)
Q3 = df['Votes'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df.loc[(df['Votes'] < lower_bound) | (df['Votes'] > upper_bound)]
num_outliers = outliers.shape[0]

# print the number of outliers
print(f"Number of outliers: {num_outliers}")

In [None]:
# calculate the interquartile range (IQR)
Q1 = df['Ratings'].quantile(0.25)
Q3 = df['Ratings'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df.loc[(df['Ratings'] < lower_bound) | (df['Ratings'] > upper_bound)]
num_outliers = outliers.shape[0]

# print the number of outliers
print(f"Number of outliers: {num_outliers}")

In [None]:
# calculate the interquartile range (IQR)
Q1 = df['Gross Collection'].quantile(0.25)
Q3 = df['Gross Collection'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df.loc[(df['Gross Collection'] < lower_bound) | (df['Gross Collection'] > upper_bound)]
num_outliers = outliers.shape[0]

# print the number of outliers
print(f"Number of outliers: {num_outliers}")

- The Metascore column has no outliers, while the Votes and Gross Collection columns have the largest number of outliers.

# Exploratory data analysis

In [None]:
# Let us see the top 5 voted movies using bivariate analysis
top_voted = df.sort_values(['Votes'], ascending = False)
plt.figure(figsize=(25, 10))
g=sns.barplot(x=top_voted['Name'][:5],y=top_voted['Votes'][:5], palette = 'hls')
g.set_title("Top Voted Movies", weight = "bold")
plt.show()


In [None]:
# Lets see the top 5 voted movies ratings
plt.figure(figsize=(25, 10))
g=sns.barplot(x=top_voted['Name'][:5],y=top_voted['Ratings'][:5], palette = 'husl')
g.set_title("IMDB Rating of top voted movies", weight = "bold")
plt.show()

In [None]:
# Lets see the metascore of this top 5 voted movies
plt.figure(figsize=(25, 10))
g=sns.barplot(x=top_voted['Name'][:5],y=top_voted['Metascore'][:5], palette = 'husl')
g.set_title("Metascore of top rated movies", weight = "bold")
plt.show()

In [None]:
# Lets see the Gross Collection of this top 5 voted movies
plt.figure(figsize=(25, 10))
g=sns.barplot(x=top_voted['Name'][:5],y=top_voted['Gross Collection'][:5], palette = 'husl')
g.set_title("Gross collection by top rated movies", weight = "bold")
plt.show()

- The five top voted movies are 'The Dark Knight', 'Inception', ' The Matrix Name', 'The Lord of the Rings - The Fellowship of the Ring' and 'The Lord of the Rings - The Return of the king'.
- All five top voted movies are rated above 8.
- The Metascore of each of these top voted movies is greater than 60.
- Ranked by earnings, 'The Dark Knight' is in first position among these top voted movies, and all the other movies earned more than 150 million dollars.

In [None]:
# Lets see the top movies by highest earnings
highest_earning = df.sort_values(['Gross Collection'], ascending = False)
plt.figure(figsize=(20, 9))
g=sns.barplot(x=highest_earning['Name'][:5],y=highest_earning['Gross Collection'][:5], palette = 'husl')
g.set_title("Movies with highest Gross (earning)", weight = "bold")
plt.show()

In [None]:
# Lets see the top movies by ratings
plt.figure(figsize=(20, 9))
highest_rating = df.sort_values(['Ratings'], ascending = False)
g=sns.barplot(x=highest_rating['Name'][:5],y=highest_rating['Ratings'][:5], palette = 'husl')
g.set_title("Movies with highest highest_rating", weight = "bold")
plt.show()

- 'Star Wars Episode VII The Force Awakens', 'Avengers Endgame', 'Spider Man No Way Home Name', 'Avatar' and 'Top Gun Maverick' have the highest earnings, but they are not in the list of top voted movies.
- We can also observe that the gross collection of the top voted movies is not more than 600 million dollars, yet they are still in the lists of the five top voted and top rated movies.

In [None]:
# Ten years with the largest number of releases (univariate view)
releases_per_year = df['Year'].value_counts().head(10)
plt.figure(figsize=(20, 9))
g = sns.barplot(x=releases_per_year.index, y=releases_per_year)
g.set_title("Maximum movies released in-", weight = "bold")
g.set_xlabel("Years")
plt.show()

- The largest numbers of movies were released in the years 2016, 2014, 2013 and 2011.

In [None]:
# Ten directors that occur most often in Director1
director_counts = df['Director1'].value_counts().head(10)
plt.figure(figsize=(20, 9))
g = sns.barplot(x=director_counts.index, y=director_counts)
g.set_title("Mostly Occurred Directors", weight = "bold")
g.set_xlabel("Directors", weight = "bold")
plt.show()

- Director 'Michael Bay' directed the largest number of movies in this dataset (13 or more), more than any other director.

In [None]:
# Ten most frequent names in each of the four star columns, one panel per column
stars=['Star1','Star2','Star3','Star4']
fig,axs=plt.subplots(4,1,figsize=(20,7))
for panel, star_col in zip(axs, stars):
    top_names = df[star_col].value_counts().head(10)
    panel.bar(top_names.index, top_names, color = 'Gray')
    panel.set_title(star_col)
    panel.set_ylabel("Appearances", weight = "bold")
    plt.tight_layout()

In [None]:
# Lets see the appearances of Stars in Top voted movies
stars=['Star1','Star2','Star3','Star4']
fig,axs=plt.subplots(4,1,figsize=(20,7))
ax=0
for x in stars:
    s=df.groupby([x]).sum().reset_index()
    d=s.sort_values(['Votes'],ascending=False)[:10]
    axs[ax].bar(d[x],d['Votes'],color = 'Gray')
    axs[ax].set_title(x)
    axs[ax].set_ylabel("Appearances", weight = "bold")
    ax+=1
    plt.tight_layout()

In [None]:
# Lets see the appearances of Stars in top grossed movies
stars=['Star1','Star2','Star3','Star4']
fig,axs=plt.subplots(4,1,figsize=(20,7))
ax=0
for x in stars:
    s=df.groupby([x]).sum().reset_index()
    d=s.sort_values(['Gross Collection'],ascending=False)[:10]
    axs[ax].bar(d[x],d['Gross Collection'],color = 'Gray')
    axs[ax].set_title(x)
    axs[ax].set_ylabel("Appearances", weight = "bold")
    ax+=1
    plt.tight_layout()

In [None]:
# Lets see the appearances of Stars in top rated movies
stars=['Star1','Star2','Star3','Star4']
fig,axs=plt.subplots(4,1,figsize=(20,7))
ax=0
for x in stars:
    s=df.groupby([x]).sum().reset_index()
    d=s.sort_values(['Ratings'],ascending=False)[:10]
    axs[ax].bar(d[x],d['Ratings'], color = 'Gray')
    axs[ax].set_title(x)
    axs[ax].set_ylabel("Appearances", weight = "bold")
    ax+=1
    plt.tight_layout()

In [None]:
# Lets see the appearances of Stars in top meta_scores
stars=['Star1','Star2','Star3','Star4']
fig,axs=plt.subplots(4,1,figsize=(17,7))
ax=0
for x in stars:
    s=df.groupby([x]).sum().reset_index()
    d=s.sort_values(['Metascore'],ascending=False)[:10]
    axs[ax].bar(d[x],d['Metascore'],color = 'Gray')
    axs[ax].set_title(x)
    axs[ax].set_ylabel("Appearances", weight = "bold")
    ax+=1
    plt.tight_layout()


- The star 'Samuel L. Jackson' has worked on many movies, and people gave his movies good ratings and Metascores.
- By gross collection and votes, the star 'Robert Downey Jr.' is in the highest position among all stars.
- People gave the largest number of votes to his movies.

In [None]:
# Number of movies per certificate.
# The column is passed as `x=`: a positional Series would be bound to countplot's `data`
# parameter in current seaborn releases instead of being counted per category.
plt.figure(figsize=(20, 9))
g=sns.countplot(x=df['Certificate'])
g.set_title("Count of Certificates provided", weight = "bold")
plt.show()

- The certificates most often awarded to movies are 'PG 13', 'R' and 'PG'.

In [None]:
# Pool the three genre slots and count how often each genre occurs overall
genre_counts = pd.concat([df[slot] for slot in ('Genre1', 'Genre2', 'Genre3')]).value_counts()
fig, ax = plt.subplots(figsize=(20, 9))
ax.bar(genre_counts.index, genre_counts.values)
ax.set_xlabel('Genre')
ax.set_ylabel('Count')
ax.set_title('Distribution of Movie Genres')
plt.show()

In [None]:
# Total the numeric columns per genre, separately for each of the three genre slots.
df_by_genre1 = df.groupby('Genre1')
df_by_genre2 = df.groupby('Genre2')
df_by_genre3 = df.groupby('Genre3')
# numeric_only=True keeps text columns (names, stars, ...) out of the sums; without it
# they are aggregated as well and the additions below stop being purely numeric.
agg_genre1=df_by_genre1.sum(numeric_only=True)
agg_genre2=df_by_genre2.sum(numeric_only=True)
agg_genre3=df_by_genre3.sum(numeric_only=True)
# Add the per-slot totals; a genre missing from one slot contributes 0.0 for that slot.
ge1_ge2 = agg_genre1.add(agg_genre2,fill_value=0.0)
final_genre = ge1_ge2.add(agg_genre3,fill_value=0.0)
# NOTE(review): head(10) returns the first ten genres in index order, not the ten largest totals.
genre_temp=final_genre.reset_index().head(10)
genre_temp

- Action is Genre which found is Maximum movies.
- After grouping and adding all the Genre we observe that Action has the maximum Revenue, Ratings, Votes and Metascore.
- Adventure movies are on second position according to Votes, Rating and Gross Collection.
- Peoples also like to watch Comedy,Crime,Drama and Fantasy movies but Crime movies Revenue is less than other three Genre.

In [None]:
# Distribution of movie runtimes: 20-bin histogram with a density curve on top
fig, ax = plt.subplots(figsize=(20, 6))
sns.histplot(data=df, x='Duration in min', bins=20, kde=True, ax=ax)
plt.show()

- Most of Movies Belongs to Duration Range between 80 to 130 Minutes !

In [None]:
list(df['Metascore'])

In [None]:
df['Metascore'] = df['Metascore'].interpolate().astype(int)

- We use interpolation method because it is a method of estimating values between two known values based on some assumed relationship between the values.

In [None]:
list(df['Metascore'])

In [None]:
list(df['Genre2'])

In [None]:
list(df['Genre3'])

In [None]:
df['Genre2'].fillna(df['Genre2'].mode()[0],inplace = True)
df['Genre3'].fillna(df['Genre3'].mode()[0],inplace = True)

In [None]:
df['Star2'].fillna(df['Star2'].mode()[0],inplace = True)
df['Star3'].fillna(df['Star3'].mode()[0],inplace = True)
df['Star4'].fillna(df['Star4'].mode()[0],inplace = True)

In [None]:
df.isnull().sum()

## Gross Collection is our target column and it has some null values; before deciding whether to remove them, let's analyse those rows

In [None]:
# Store the data in missing values whose Gross Collection is null
missing_values = df.loc[df['Gross Collection'].isna()]
missing_values

In [None]:
missing_values.corr()

- The year of the movie has a weak negative correlation with the metascore and ratings of the movie, which suggests that newer movies tend to be rated slightly lower than older movies.

- The duration of the movie has a weak positive correlation with the metascore, ratings, which suggests that longer movies may be more highly rated and more successful at the box office.

- The metascore and ratings have a strong positive correlation, which suggests that they may be measuring similar aspects of the movie's quality.

In [None]:
# Number of rows with a missing Gross Collection per release year.
missing_values['Year'].value_counts()

- If we drop the rows with a missing Gross Collection, we lose a lot of data for the movies released between 2016 and 2022.

In [None]:
# Certificates of the rows whose Gross Collection is missing.
# The column is passed as `x=`: a positional Series would be bound to countplot's `data`
# parameter in current seaborn releases instead of being counted per category.
plt.figure(figsize=(20, 9))
g=sns.countplot(x=missing_values['Certificate'])
g.set_title("Count of Certificates provided", weight = "bold")
plt.show()

- According to our previous graph of Certificate we have maximum count for PG 13 and R certificates and all other certificates has less count. If we drop missing_values data then We loss details of movies which awarded by certificates like PG 13, PG, R which has maximum count. 

In [None]:
# Genre1 frequencies among the rows with a missing Gross Collection.
missing_values['Genre1'].value_counts()

In [None]:
# Genre2 frequencies among the same rows.
missing_values['Genre2'].value_counts()

In [None]:
# Genre3 frequencies among the same rows.
missing_values['Genre3'].value_counts()

- If we drop the missing_values data we loss the high count of Action, Drama and Adventure Genres.  

- If we lose 15.30% of the data we lose a lot of important information. Dropping the rows with a missing target value is therefore not the right decision in this case.

In [None]:
list(df['Gross Collection'])

In [None]:
# Interpolate missing values using linear interpolation
df['Gross Collection'] = df['Gross Collection'].interpolate(method='linear')

# Round values to 2 decimal places
df['Gross Collection'] = round(df['Gross Collection'], 2)


- We use interpolation method because it is a method of estimating values between two known values based on some assumed relationship between the values

In [None]:
# Gross Collection values after the interpolation.
list(df['Gross Collection'])

In [None]:
# Total number of missing values left in the whole frame.
df.isnull().sum().sum()

In [None]:
# Creating a copy of data which is useful for prediction of Ratings
# The copy is independent of df, so later changes to df do not affect it.
New_df = df.copy()

In [None]:
# One scatter layer per genre slot, all against Gross Collection on a shared axis
fig, ax = plt.subplots(figsize=(20,6))
for genre_col in ('Genre1', 'Genre2', 'Genre3'):
    ax.scatter(df[genre_col], df['Gross Collection'], alpha=0.5, label=genre_col)

# Axis labels, title and legend
ax.set_xlabel('Genres')
ax.set_ylabel('Gross Collection')
ax.set_title('Scatter Plot of Genres vs. Gross Collection')
ax.legend()

plt.show()


- By observing above graphs we can see that Gross Collection of Genre not varies according to the Genre. But here we are not taking the decision to drop genre even its not showing good relation with Gross Collection. Because Certain genres tend to be more popular with certain demographics or during specific times of the year.

In [None]:
# Lets see the correlation 
# Only the numeric columns are correlated: df still holds text columns (names, genres,
# stars, ...) which corr() cannot convert to numbers.
plt.figure(figsize=(20,9))
sns.heatmap(df.select_dtypes(include='number').corr(),annot= True,linewidths=1,fmt=' .2f',cmap="YlGnBu")
plt.show()

- There is a weak positive correlation (0.13) between Year and Duration in min, suggesting that movies released in more recent years tend to be slightly longer.
- There is a weak negative correlation (-0.14) between Year and Ratings, suggesting that movies released in more recent years tend to be slightly lower rated on average.
- There is a positive correlation between Duration in min and all other variables, with the strongest correlation being with Ratings (0.34), suggesting that longer movies tend to receive higher ratings, more votes, and make more money.
- There is a positive correlation between Votes and Gross Collection (0.68), suggesting that more popular movies tend to make more money.
- There is a positive correlation between Ratings and Gross Collection (0.28), suggesting that movies that are well-rated on IMDb tend to make more money.
- There is a weak positive correlation (0.26) between Metascore and Gross Collection, suggesting that movies with higher scores on Metascore tend to make slightly more money.
- There is a positive correlation between Duration in min and Gross Collection (0.33), suggesting that longer movies tend to make more money. 

In [None]:
# Gross Collection per certificate, drawn as one bar per certificate
fig, ax = plt.subplots(figsize=(20, 9))
g = sns.barplot(data=df, x='Certificate', y='Gross Collection', ax=ax)
g.set_title("Certificate and Gross Collection", weight = "bold")
plt.show()

- PG 13,PG,13+ certificates awarded movies has a highest Gross Collection than other certificates.

In [None]:
# Directors ordered from most to fewest movies
sorted_directors = df['Director1'].value_counts().sort_values(ascending=False)

# Summed Gross Collection per director, re-ordered to follow the frequency ranking above
sorted_director_gross = (
    df.groupby('Director1')['Gross Collection']
      .sum()
      .loc[sorted_directors.index]
)

# Show the totals in that order
sorted_director_gross


- A director who directed the most movies does not necessarily achieve the highest Gross Collection. Gross Collection does not show a strong relation with the director.

# Encoding the Categorical column

In [None]:
import category_encoders as ce
# Define the target column
target_col = "Gross Collection"

# Define the categorical columns to be encoded
cat_cols = ["Genre1", "Genre2", "Genre3", "Star1", "Star2", "Star3", "Star4"]

# Create an instance of TargetEncoder and fit it on the data
# NOTE(review): the encoder is fitted on the full frame, before the train/test split made
# later in the notebook, so the encoded features carry target information from rows that
# end up in the test set — confirm whether this leakage is acceptable.
te = ce.TargetEncoder(cols=cat_cols)
te.fit(df, df[target_col])

# Apply the encoding on the categorical columns
df_encoded = te.transform(df)

# Print the encoded DataFrame
df_encoded.head()


- TargetEncoder is a type of categorical encoding technique that encodes each category in a categorical variable based on the mean target value of the observations in that category. 

In [None]:
df_encoded.info()

In [None]:
# To encode Name column lets check the unique values in name column
print(len(list(df_encoded['Name'].unique())))

In [None]:
print(len(list(df_encoded['Director1'].unique())))

In [None]:
import category_encoders as ce
# Encode the Director1 column with CountEncoder (the encoder is fitted and applied in one step)
fe = ce.CountEncoder(cols=["Director1"])
df_encoded = fe.fit_transform(df_encoded)

# Print the encoded DataFrame
df_encoded.head()

- Frequency (count) encoding is a technique that encodes each category in a categorical variable by the frequency (or count) of observations in that category. It can be used even if the variable has a large number of unique values, because it replaces the column with a single numeric column rather than creating one new feature per category.

In [None]:
# Number of distinct certificate labels.
len(list(df_encoded.Certificate.unique()))

In [None]:
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder and fit it on the data
# Each distinct certificate label is replaced by an integer code.
le = LabelEncoder()
df_encoded['Certificate'] = le.fit_transform(df_encoded['Certificate'])

In [None]:
from sklearn.preprocessing import LabelEncoder
# Create an instance of LabelEncoder and fit it on the data
# A fresh encoder is used for 'Name'; `le` now refers to this second encoder.
le = LabelEncoder()
df_encoded['Name'] = le.fit_transform(df_encoded['Name'])

- LabelEncoder is a simple method that replaces each distinct value of a categorical variable with an integer code; it is most suitable when the variable has a small number of unique values.

In [None]:
# Save the encoded frame. index=False keeps the row index out of the file, so reading
# it back does not produce an extra 'Unnamed: 0' column.
df_encoded.to_csv('df_encoded.csv', index=False)

- Now we have successfully encode all the features. 

In [None]:
# Lets see the correlation 
# Pairwise correlations of the encoded frame, annotated with two decimals.
plt.figure(figsize=(25,15))
sns.heatmap(df_encoded.corr(),annot= True,linewidths=1,fmt=' .2f',cmap="YlGnBu")
plt.show()

- The Name, Director1 and Genre1 columns show a very weak correlation with the dependent and the independent features.
- Although Genre1 also shows a very weak correlation, the genre is split across three columns, so dropping one of them is not a good decision: in the analysis above the Action genre had the highest gross collection and it occurs most often in Genre1, so dropping that column may lose important information.
- We drop Name and Director1 because they do not add much value to our dataset.

In [None]:
# Remove the two weakly correlated identifier-like columns
df_encoded = df_encoded.drop(columns=['Director1', 'Name'])

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Create a new dataframe with all the features (numeric and categorical)
X = df_encoded[['Year','Duration in min','Votes','Ratings','Metascore']]

# Calculate VIF values for each featur
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF values for each feature
print(vif)


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Create a new dataframe with all the features (numeric and categorical)
X = df_encoded[['Duration in min','Votes','Year','Metascore']]

# Calculate VIF values for each feature
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF values for each feature
print(vif)

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Create a new dataframe with all the features (numeric and categorical)
X = df_encoded[['Votes','Duration in min','Metascore']]

# Calculate VIF values for each feature
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF values for each feature
print(vif)

- Here we choose to drop Ratings and Metascore to avoid multicollinearity. Year shows a very weak correlation with the dependent and independent features, so we drop it as well. The VIF is still very high, but Duration shows a fairly high positive correlation with the target column, so we do not drop any other feature right now. We also do not see a strong relation between Certificate and Gross Collection, so we drop Certificate too.

In [None]:
df_encoded.drop(['Metascore','Year','Certificate','Ratings'],axis = 1, inplace = True)

In [None]:
df_encoded.head(5)

# Checking for Outliers

In [None]:
# calculate the interquartile range (IQR)
Q1 = df_encoded['Votes'].quantile(0.25)
Q3 = df_encoded['Votes'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df_encoded.loc[(df['Votes'] < lower_bound) | (df_encoded['Votes'] > upper_bound)]
outliers['Votes']

In [None]:
# calculate the interquartile range (IQR)
Q1 = df_encoded['Duration in min'].quantile(0.25)
Q3 = df_encoded['Duration in min'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df_encoded.loc[(df['Duration in min'] < lower_bound) | (df_encoded['Duration in min'] > upper_bound)]
outliers['Duration in min']

In [None]:
# calculate the interquartile range (IQR)
Q1 = df_encoded['Gross Collection'].quantile(0.25)
Q3 = df_encoded['Gross Collection'].quantile(0.75)
IQR = Q3 - Q1

# calculate the upper and lower bounds for outliers
upper_bound = Q3 + 1.5*IQR
lower_bound = Q1 - 1.5*IQR

# count the number of outliers
outliers = df_encoded.loc[(df['Gross Collection'] < lower_bound) | (df_encoded['Gross Collection'] > upper_bound)]
outliers['Gross Collection']

- Here we do not remove the outliers.
- If we removed the outliers we would lose the highest Gross Collection, Duration and Votes values, which are useful for prediction.

In [None]:
# Skewness of every remaining column (positive = right tail, negative = left tail).
df_encoded.skew()

- Here we can see Ratings and Certificates are skewed towords left and other columns skewd toword right.

In [None]:
# Divide the data into features and vectors.
X=df_encoded.drop(['Gross Collection'], axis=1)
y=df_encoded['Gross Collection'] 


In [None]:
X[0:2]

In [None]:
y[0:2]

In [None]:
# Natural log of every feature value.
# NOTE(review): np.log yields -inf for 0 and NaN for negative values — confirm all features are strictly positive.
X=np.log(X)

In [None]:
from sklearn.preprocessing import StandardScaler

# Create an instance of the scaler
scaler = StandardScaler()

# Fit the scaler to the data and transform the data
# NOTE(review): the scaler is fitted on all rows before the train/test split below,
# so test rows influence the scaling — confirm whether this is acceptable.
X = scaler.fit_transform(X)

In [None]:
from scipy.stats import skew
# Skewness of each transformed feature column.
skewness = skew(X)
print("Skewness:", skewness)

In [None]:
# Distribution of all transformed feature values pooled into one histogram with a density curve.
# histplot (already used earlier in this notebook) replaces distplot, which is deprecated
# in seaborn; the values are flattened so the features form a single sample.
sns.histplot(np.ravel(X), kde=True)

# set the title and axis labels
plt.title('Distribution Plot')
plt.xlabel('Values')
plt.ylabel('Frequency')


In [None]:
# Split the data
# 80% of the rows go to training, 20% to testing; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test= train_test_split(X,y,train_size=0.80,random_state=0)

In [None]:
# Shapes of the training features and target.
print(x_train.shape)
print(y_train.shape)

In [None]:
# Shapes of the test features and target.
print(x_test.shape)
print(y_test.shape)

# Model building

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np

# X and y come from the feature/target cells above.
# NOTE(review): picking the split seed by its test-set R-squared makes the reported score
# optimistic — confirm that this selection is intended.

# Define a range of random state values to try
random_states = np.arange(100)

# Initialize variables to keep track of the best model and its performance
best_r2_score = -np.inf
best_random_state = None
best_model = None

# Loop through all the random state values to find the best one
for random_state in random_states:
    # Split the data into training and testing sets
    # (this overwrites y_train / y_test on every iteration)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

    # Fit a linear regression model to the training data
    model = LinearRegression().fit(X_train, y_train)

    # Make predictions on the testing data
    y_pred = model.predict(X_test)

    # Compute the R-squared score for this model
    r2 = r2_score(y_test, y_pred)

    # Check if this model has a better R-squared score than the previous best model
    if r2 > best_r2_score:
        best_r2_score = r2
        best_random_state = random_state
        best_model = model

# After the loop, model / y_pred / r2 hold the values of the LAST seed tried, not the best one.
# Print the best random state and its corresponding R-squared score
print("Best random state:", best_random_state)
print("Best R-squared score:", best_r2_score)

In [None]:
# Re-split 80/20 with seed 2; the regressors below are fitted on this split.
# NOTE(review): 2 is presumably the "best random state" printed by the search — confirm, or pass best_random_state.
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.80, random_state=2)

In [None]:
from sklearn.model_selection import cross_val_score

# Track the fold count with the highest mean R2. Start below any reachable score,
# because mean R2 can be negative and a floor of 0 would hide every candidate.
best_cv = None
best_score = -np.inf

for cv in range(3, 11):
    # cross_val_score clones `model`, so each fold trains a fresh LinearRegression
    cv_scores = cross_val_score(model, X, y, cv=cv, scoring='r2')
    cv_score = cv_scores.mean()
    print('{} at this fold score is {}'.format(cv,cv_score))
    # Record the winner (best_cv / best_score were previously never updated)
    if cv_score > best_score:
        best_cv, best_score = cv, cv_score

print('Best fold count:', best_cv, 'with mean R2', best_score)


- We can see that 10 folds give a better mean R2 score than the other fold counts.

In [None]:
# Fit and score a linear model on the current x_train / x_test split so that the
# plotted points and the R2 in the title describe the same rows. The y_pred and r2
# left over from the seed-search loop belong to its final split (seed 99), while
# y_test was re-created with seed 2.
lr_model = LinearRegression().fit(x_train, y_train)
y_pred = lr_model.predict(x_test)
r2 = r2_score(y_test, y_pred)

plt.scatter(y_test, y_pred)
plt.title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.show()


# Lasso regression

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression with scikit-learn's default settings
lasso = Lasso()
lasso_reg = lasso.fit(x_train, y_train)   # fit() returns the fitted estimator itself

# Predict the held-out rows and score them
lasso_y_pred = lasso_reg.predict(x_test)
r2 = r2_score(y_test, lasso_y_pred)

print("R2 Score", r2)

In [None]:
# 10-fold cross-validated R2 for the Lasso model on the full data
cv_scores = cross_val_score(lasso, X, y, cv=10, scoring='r2')
# Mean R2 across the folds
print(np.mean(cv_scores))

In [None]:
# Actual vs. predicted values for Lasso, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_test, lasso_y_pred)
ax.set_title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()


# Ridge regression

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression with scikit-learn's default settings
ridge = Ridge()
ridge_reg = ridge.fit(x_train, y_train)   # fit() returns the fitted estimator itself

# Predict the held-out rows and score them
ridge_y_pred = ridge_reg.predict(x_test)
r2 = r2_score(y_test, ridge_y_pred)

print(r2)

In [None]:
# 10-fold cross-validated R2 for the Ridge model on the full data
cv_scores = cross_val_score(ridge, X, y, cv=10, scoring='r2')
cv_mean = cv_scores.mean()

print(cv_mean)

In [None]:
# Actual vs. predicted values for Ridge, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_test, ridge_y_pred)
ax.set_title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()


# KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regressor with scikit-learn's default settings
knn = KNeighborsRegressor()
knn_reg = knn.fit(x_train,y_train)   # fit() returns the fitted estimator itself

# Predict the held-out rows and score them
knn_y_pred = knn_reg.predict(x_test)
r2 = r2_score(y_test, knn_y_pred)

print('R2 score', r2)

In [None]:
# 10-fold cross-validated R2 for the KNN regressor on the full data
cv_scores = cross_val_score(knn, X, y, cv=10, scoring='r2')
cv_mean = cv_scores.mean()
cv_mean

In [None]:
# Actual vs. predicted values for the KNN regressor, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_test, knn_y_pred)
ax.set_title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()


# SVR

In [None]:
from sklearn.svm import SVR

# Support vector regressor with a linear kernel
svr = SVR(kernel='linear')
svr.fit(x_train, y_train)

# Predict the held-out rows and score them
svr_y_pred = svr.predict(x_test)
r2 = r2_score(y_test, svr_y_pred)

print("R2 score:", r2)

In [None]:
# 10-fold cross-validated R2 for the SVR model on the full data
cv_scores = cross_val_score(svr, X, y, cv=10, scoring='r2')
cv_mean = cv_scores.mean()
cv_mean

In [None]:
# Actual vs. predicted values for SVR, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_test, svr_y_pred)
ax.set_title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()


# Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Seed the tree so repeated runs give the same splits and the same score
# (same seed as the gradient-boosting models in this notebook).
dt=DecisionTreeRegressor(random_state=2)
dt_reg = dt.fit(x_train, y_train)
# Predict the held-out rows and score them
dt_y_pred=dt.predict(x_test)

r2 = r2_score(y_test, dt_y_pred)

print("R2 score",r2)

In [None]:
# 10-fold cross-validated R2 for the decision tree on the full data
cv_scores = cross_val_score(dt, X, y, cv=10, scoring='r2')
cv_mean = cv_scores.mean()
cv_mean

In [None]:
# Actual vs. predicted values for the decision tree, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_test, dt_y_pred)
ax.set_title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()


# Random Forest regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so repeated runs give the same trees and the same score
# (same seed as the gradient-boosting models in this notebook).
rf = RandomForestRegressor(random_state=2)

# Fit the model on training data
rf_reg = rf.fit(x_train, y_train)

# Make predictions on test set
rf_y_pred = rf.predict(x_test)

r2 = r2_score(y_test, rf_y_pred)

print("R2 score:", r2)

In [None]:
# 10-fold cross-validated R2 for the random forest on the full data.
# The estimator must be `rf`; this cell previously re-scored the KNN model.
cv_scores = cross_val_score(rf, X, y, cv=10, scoring='r2')
cv_mean = np.mean(cv_scores)
cv_mean

In [None]:
# Actual vs. predicted values for the random forest, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(y_test, rf_y_pred)
ax.set_title('Actual vs. Predicted Values (R2 = {:.2f})'.format(r2))
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
plt.show()

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Seeded gradient-boosted trees with otherwise default settings
gb = GradientBoostingRegressor(random_state=2)
gb_reg = gb.fit(x_train, y_train)   # fit() returns the fitted estimator itself

# Predict the held-out rows and score them
gb_y_pred = gb_reg.predict(x_test)
r2 = r2_score(y_test, gb_y_pred)

print("R2 score:", r2)

# 10-fold cross-validated R2 on the full data, for comparison with the test score
cv_scores = cross_val_score(gb, X, y, cv=10, scoring='r2')
cv_mean = cv_scores.mean()
print("cv_mean",cv_mean)

- For the R2 score, a higher value indicates a better fit: it measures how much of the variance in the target variable the model explains. We also want the mean cross-validation score to be close to the test R2 score, or at least not significantly lower than it.
- Gradient Boosting shows the smallest gap between its R2 score and its CV score, so let's tune its hyperparameters.

# Hyper parameter tuning for Gradient Boosting.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint
# Distributions and choices sampled by the randomized search.
# scipy's randint(a, b) draws integers in [a, b); uniform(loc, scale) draws from [loc, loc + scale].
param_grid = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.5),
    'max_depth': randint(1, 10),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    # None means "use every feature at each split". It replaces the 'auto' alias, which
    # meant the same thing for this regressor and is rejected by scikit-learn >= 1.3.
    'max_features': [None, 'sqrt', 'log2']
}

# Try 50 sampled configurations, each scored by 10-fold cross-validated R2 on the training split
gb_random = RandomizedSearchCV(gb, param_distributions=param_grid, cv=10, scoring='r2', n_iter=50, random_state=2)
gb_random.fit(x_train, y_train)

# Print the best hyperparameters and their mean cross-validated R2
print("Best hyperparameters:", gb_random.best_params_)
print("Best R2 score:", gb_random.best_score_)


In [None]:
# NOTE(review): the experimental enable import is only needed on scikit-learn < 1.0;
# newer releases reportedly treat it as a deprecated no-op — confirm against the installed version.
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

# Seeded histogram-based gradient boosting regressor
hgb = HistGradientBoostingRegressor(random_state=2)

# Exhaustive grid: 3 learning rates x 4 depths x 3 leaf sizes = 36 combinations
param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [1, 3, 5, 9],
    'min_samples_leaf': [1, 3, 5]
}

# Score every combination by 10-fold cross-validated R2 on the training split
hgb_grid = GridSearchCV(hgb, param_grid, cv=10, scoring='r2')
hgb_grid.fit(x_train, y_train)

# Best combination and its mean cross-validated R2
print("Best hyperparameters:", hgb_grid.best_params_)
print("Best R2 score:", hgb_grid.best_score_)

# Predict the held-out rows through the fitted search object and score them
hgb_y_pred = hgb_grid.predict(x_test)
r2 = r2_score(y_test, hgb_y_pred)
print("R2 score:", r2)


In [None]:
# 10-fold cross-validated R2 on the full data.
# NOTE(review): this scores `hgb` with its default hyperparameters, not hgb_grid.best_estimator_ — confirm that is intended.
cv_scores = cross_val_score(hgb, X, y, cv=10, scoring='r2')
cv_mean = np.mean(cv_scores)
print("cv_mean",cv_mean)

- The corresponding R2 score of the model using these hyperparameters is 0.636, which is the best R2 score achieved by any combination of hyperparameters during the hyperparameter tuning process.


- Then the model was used to make predictions on the test set, and the R2 score was found to be 0.735. This suggests that the model has a good performance on the test set, with an R2 score of 0.735 indicating that 73.5% of the variance in the target variable can be explained by the model.


- In cases where the dataset contains outliers and it is not desirable to remove them, the R2 score is easier to interpret than the raw mean squared error (MSE) because it is scale-free: it measures the proportion of variance in the target variable that is explained by the model. Note that R2 is also computed from squared errors, so it is still affected by outliers.


- HistGradientBoostingRegressor is a gradient boosting algorithm for regression tasks; it was added to scikit-learn as an experimental estimator in version 0.21 and became stable in version 1.0. It is a histogram-based implementation of gradient-boosted decision trees: it first bins each feature's values into a small number of integer bins (histograms) and then searches for tree splits over those bins, which makes it very fast and scalable for large datasets. It also supports L2 regularization to reduce overfitting.


- In our case the R2 score dropped after hyperparameter tuning of Gradient Boosting, hence we used HistGradientBoostingRegressor instead.

In [None]:
# Save the best model
import joblib
joblib.dump(hgb_grid.best_estimator_, 'best_model.pkl')

#  To Predict Ratings

In [None]:
# Display the frame used for the ratings task (pandas truncates long frames in the output)
New_df

In [None]:
# Correlation heatmap for the ratings frame.
# Only numeric/boolean columns are correlated: pandas >= 2.0 raises on text columns
# instead of silently skipping them, and New_df still holds un-encoded columns here.
plt.figure(figsize=(25,15))
sns.heatmap(New_df.select_dtypes(include=['number', 'bool']).corr(),annot= True,linewidths=1,fmt=' .2f',cmap="YlGnBu")
plt.show()

- The Year column shows only a weak correlation with both the dependent and the independent columns, hence we choose to drop it.

In [None]:
# Drop 'Year' in place; errors='ignore' keeps the cell re-runnable once the column is gone
New_df.drop('Year',axis = 1, inplace = True, errors='ignore')

In [None]:
# Directors ordered by how many movies they have in the data (most frequent first)
sorted_directors = New_df['Director1'].value_counts().sort_values(ascending=False)

# Sum of Ratings over each director's movies
director_rating = New_df.groupby('Director1')['Ratings'].sum()

# Reorder the rating totals to follow the frequency ordering above
sorted_director_rating = director_rating.loc[sorted_directors.index]

# Display the rating totals, most prolific directors first
sorted_director_rating

In [None]:
# Mean rating per certificate (seaborn's barplot aggregates with the mean by default)
plt.figure(figsize=(20, 9))
g=sns.barplot(x=New_df['Certificate'],y=New_df['Ratings'])
# The y-axis is Ratings, so the title names Ratings (it previously said "Gross Collection")
g.set_title("Certificate and Ratings", weight = "bold")
plt.show()

- We can see that movie ratings show a strong relation with certificates. Even if a movie has not been awarded any certificate, its rating is still greater than 6.5, which we can call a medium rating for this range. 

In [None]:
# Overlay Ratings against each of the three genre columns on one figure.
# NOTE(review): this reads `df` while the neighbouring cells use `New_df` — confirm which frame is intended.
plt.figure(figsize=(20,6))
# Create scatter plot for Genre1
plt.scatter(df['Genre1'], df['Ratings'], alpha=0.5, label='Genre1')

# Create scatter plot for Genre2
plt.scatter(df['Genre2'], df['Ratings'], alpha=0.5, label='Genre2')

# Create scatter plot for Genre3
plt.scatter(df['Genre3'], df['Ratings'], alpha=0.5, label='Genre3')

# Add labels and title
plt.xlabel('Genres')
plt.ylabel('Ratings')
plt.title('Scatter Plot of Genres vs. Ratings')

# Add legend
plt.legend()

# Show plot
plt.show()


- We can see here Genre shows good relationship with ratings.

In [None]:
import category_encoders as ce
# Column whose values drive the target encoding
target_col = "Ratings"

# Genre and star columns to replace with target-based encodings
cat_cols = ["Genre1", "Genre2", "Genre3", "Star1", "Star2", "Star3", "Star4"]

# Fit the encoder on the whole frame.
# NOTE(review): fitting on all rows before the train/test split lets test-row targets
# influence the encoded features — consider fitting on the training rows only.
te = ce.TargetEncoder(cols=cat_cols)
te.fit(New_df, New_df[target_col])

# Apply the encoding; this rebinds df_encoded to the ratings-task frame
df_encoded = te.transform(New_df)

# Show the first encoded rows
df_encoded.head()


In [None]:
import category_encoders as ce
# Replace each Director1 value with the number of rows it appears in (CountEncoder)
fe = ce.CountEncoder(cols=["Director1"])
df_encoded = fe.fit_transform(df_encoded)

# Show the first encoded rows
df_encoded.head()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Map each distinct Certificate value to an integer code
le = LabelEncoder()
df_encoded['Certificate'] = le.fit_transform(df_encoded['Certificate'])

In [None]:
# Re-fit the same LabelEncoder on Name; `le` no longer holds the Certificate mapping after this line
df_encoded['Name'] = le.fit_transform(df_encoded['Name'])

In [None]:
# Check the first row after all encodings
df_encoded.head(1)

In [None]:
# Correlation heatmap of the fully encoded ratings frame
plt.figure(figsize=(25,15))
sns.heatmap(df_encoded.corr(),annot= True,linewidths=1,fmt=' .2f',cmap="YlGnBu")
plt.show()

In [None]:
# Drop Name and Certificate in place; errors='ignore' keeps the cell re-runnable once they are gone
df_encoded.drop(['Name','Certificate'],axis = 1, inplace = True, errors='ignore')

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Check four numeric columns for multicollinearity (note: this rebinds X)
X = df_encoded[['Duration in min','Votes','Metascore','Gross Collection']]

# One VIF per column, computed by column position
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF values for each feature
print(vif)


In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Repeat the multicollinearity check without 'Duration in min' (note: this rebinds X)
X = df_encoded[['Votes','Metascore','Gross Collection']]

# One VIF per column, computed by column position
vif = pd.DataFrame()
vif["Feature"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

# Print the VIF values for each feature
print(vif)

In [None]:
# Drop 'Duration in min' in place; errors='ignore' keeps the cell re-runnable once it is gone
df_encoded.drop('Duration in min',axis = 1,inplace = True, errors='ignore')

In [None]:
# Distinct rating values, in order of first appearance
list(df_encoded['Ratings'].unique())

- Here we choose to convert rating ranges into classes and solve this as a classification problem, to reduce the complexity that may arise during prediction if we treated it as a regression problem.

In [None]:
# Rating bins as half-open intervals [low, high) and the class label for each bin.
# Half-open bins leave no gaps: the former closed bins such as (6.0, 6.9) returned
# None for any value between 6.9 and 7.0 (e.g. 6.95).
ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0), (5.0, 6.0), (6.0, 7.0), (7.0, 8.0), (8.0, 9.0), (9.0, 10.0)]
labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]

def assign_label(rating):
    """Return the class label (1-9) of the bin containing `rating`.

    Parameters
    ----------
    rating : float
        A movie rating.

    Returns
    -------
    int or None
        The label of the bin with low <= rating < high, or None when the rating
        lies outside [1.0, 10.0) or is NaN (every comparison with NaN is False).
    """
    for i, r in enumerate(ranges):
        if r[0] <= rating < r[1]:
            return labels[i]
    return None

# Overwrite Ratings with its class label (the column is replaced, not added alongside)
df_encoded['Ratings'] = df_encoded['Ratings'].apply(assign_label)

# Show the first rows with the labelled Ratings column
df_encoded.head()


In [None]:
# Per-column skewness of the ratings frame after labelling
df_encoded.skew()

In [None]:
def box_plot(f):
    """Draw and show a box plot of column `f` of the notebook-level `df_encoded` frame."""
    df_encoded[f].plot(kind='box')
    plt.show()

In [None]:
# Spread and outliers of Metascore
box_plot('Metascore')

In [None]:
# Spread and outliers of Votes
box_plot('Votes')

In [None]:
# Spread and outliers of Gross Collection
box_plot('Gross Collection')

# Removing outliers

In [None]:
from scipy.stats import zscore
# Absolute z-score of every value, computed per column (scipy's default is axis=0).
# NOTE(review): a NaN anywhere in a column makes that column's z-scores NaN by default — confirm the frame has no missing values.
z= np.abs(zscore(df_encoded))

In [None]:
# |z| cut-off beyond which a value is treated as an outlier
threshold= 3 
# (row, column) positions above the cut-off; uses `threshold` instead of repeating the literal
print(np.where(z>threshold))

In [None]:
# Keep only the rows whose every column has |z| below 3
df2 = df_encoded.loc[(z < 3).all(axis=1)]
# Frame shape before and after filtering, one per line
print(df_encoded.shape, df2.shape, sep='\n')

In [None]:
# Percentage of rows removed by the z-score filter, computed from the frames
# themselves rather than from hard-coded row counts
data_loss = (len(df_encoded) - len(df2)) / len(df_encoded) * 100
data_loss

In [None]:
# Predictors: every column except the class label
X = df2.drop(columns=['Ratings'])
# Target: the rating class to predict
y = df2['Ratings']

In [None]:
# First two feature rows
X.iloc[:2]

In [None]:
# First two class labels
y.iloc[:2]

In [None]:
# Element-wise square root of every feature (presumably to reduce right skew).
# NOTE(review): np.sqrt returns NaN for negative values — confirm all features are non-negative.
X=np.sqrt(X)

In [None]:
# Standardise the features in a single fit-and-transform step.
# X becomes a NumPy array from here on.
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [None]:
from scipy.stats import skew
# Skewness of each column of the scaled feature matrix (scipy's default is axis=0)
skewness = skew(X)
print("Skewness:", skewness)

In [None]:
# NOTE(review): this repeats the skewness computation of the preceding cell unchanged — likely safe to delete.
skewness = skew(X)
print("Skewness:", skewness)

In [None]:
# Histogram plus KDE of the scaled feature values.
# NOTE(review): seaborn deprecated `distplot` in v0.11 in favour of `histplot`/`displot`; plan a migration.
sns.distplot(X, kde=True, hist=True)

# set the title and axis labels
plt.title('Distribution Plot')
plt.xlabel('Values')
plt.ylabel('Frequency')


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Highest test accuracy found so far and the split seed that produced it
best_score = 0
best_rs = 0

# NOTE(review): picking the split seed by its test accuracy gives an optimistic
# estimate; after the loop, clf / X_test / y_test hold the LAST seed's values.
for rs in range(100):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rs
    )

    # Logistic regression fitted on this split
    clf = LogisticRegression(random_state=rs)
    clf.fit(X_train, y_train)

    # score() is the mean accuracy on the held-out rows
    score = clf.score(X_test, y_test)

    # Keep the seed that scored highest
    if score > best_score:
        best_score, best_rs = score, rs

print(f'Best random state: {best_rs}, Best test score: {best_score:.3f}')


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Rebuild the split and the model for the selected seed. After the search loop,
# clf / X_test / y_test still hold the final iteration (seed 99), not the
# best-scoring one, so evaluating them directly reports the wrong model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=best_rs)
clf = LogisticRegression(random_state=best_rs)
clf.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = clf.predict(X_test)

# Macro-averaged metrics weight every class equally; zero_division=1 scores a class
# with no predictions as 1 instead of emitting a warning
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro',zero_division=1)
f1 = f1_score(y_test, y_pred, average='macro',zero_division=1)
cm = confusion_matrix(y_test, y_pred)


# Print the evaluation metrics
print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 score: {f1:.3f}')
print('Confusion matrix:')
print(cm)

- The overall accuracy of the model is 0.557, which is not very high, indicating that the model is not very accurate in predicting the correct class labels.
- The precision score is 0.394, which means that when the model predicts a class label, it is correct 39.4% of the time on average across all classes.
- The recall score is 0.562, which means that the model correctly identifies only 56.2% of the positive class instances across all classes.
- The F1 score is 0.394, which is a harmonic mean of precision and recall scores, indicating that the model is not performing well on both metrics.
- Overall, while the model may have some predictive power, it is not performing very well on this dataset. Further analysis and improvement of the model may be required.

In [None]:
from sklearn.model_selection import cross_val_score

# Unfitted logistic regression; cross_val_score trains a clone per fold
clf = LogisticRegression(random_state=26)

# Mean cross-validated accuracy for 3 to 8 folds
for cv in range(3,9):
    cv_scores = cross_val_score(clf, X, y, cv=cv,scoring = 'accuracy')
    mean_cv_score = np.mean(cv_scores)
    print("{} at this fold cv_score is {}".format(cv,mean_cv_score))


- At fold 8 we get good accuracy hence we choose 8 as a best fold.

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Hold out 20% of the rows for testing with seed 26
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=26)

# Oversample the training rows only, so the test set keeps its original class mix
ros = RandomOverSampler(random_state=26)
X_train, y_train = ros.fit_resample(X_train, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree trained on the oversampled training rows
clf = DecisionTreeClassifier(random_state=26).fit(X_train, y_train)

# Predict the held-out rows
y_pred = clf.predict(X_test)

# Support-weighted metrics; zero_division=1 scores a class with no predictions as 1
weighted = dict(average='weighted', zero_division=1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
print("Confusion matrix:")
print(conf_matrix)


In [None]:
# 8-fold cross-validated accuracy on the full (not oversampled) data
cv_scores = cross_val_score(clf, X, y, cv=8, scoring='accuracy')
mean_cv_score = np.mean(cv_scores)
mean_cv_score

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest trained on the oversampled training rows
rfc = RandomForestClassifier(random_state=26)
rfc.fit(X_train, y_train)

# Predict the held-out rows; score() is the mean accuracy, the same quantity as `accuracy`
y_pred = rfc.predict(X_test)
score = rfc.score(X_test, y_test)

# Support-weighted metrics; zero_division=1 scores a class with no predictions as 1
weighted = dict(average='weighted', zero_division=1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
print("Confusion matrix:")
print(conf_matrix)

In [None]:
# 8-fold cross-validated accuracy of the random forest on the full (not oversampled) data
cv_scores = cross_val_score(rfc, X, y, cv=8, scoring='accuracy')
mean_cv_score = np.mean(cv_scores)
mean_cv_score

In [None]:
from sklearn.svm import SVC

# Support vector classifier trained on the oversampled training rows
svc = SVC(random_state=26)
svc.fit(X_train, y_train)

# Predict the held-out rows; score() is the mean accuracy, the same quantity as `accuracy`
y_pred = svc.predict(X_test)
score = svc.score(X_test, y_test)

# Support-weighted metrics; zero_division=1 scores a class with no predictions as 1
weighted = dict(average='weighted', zero_division=1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
print("Confusion matrix:")
print(conf_matrix)

In [None]:
# 8-fold cross-validated accuracy of the SVC on the full (not oversampled) data
cv_scores = cross_val_score(svc, X, y, cv=8, scoring='accuracy')
mean_cv_score = np.mean(cv_scores)
mean_cv_score

# KNN classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 5-nearest-neighbours classifier trained on the oversampled training rows
# (this rebinds `knn`, which held the KNN regressor earlier in the notebook)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict the held-out rows; score() is the mean accuracy, the same quantity as `accuracy`
y_pred = knn.predict(X_test)
score = knn.score(X_test, y_test)

# Support-weighted metrics; zero_division=1 scores a class with no predictions as 1
weighted = dict(average='weighted', zero_division=1)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **weighted)
recall = recall_score(y_test, y_pred, **weighted)
f1 = f1_score(y_test, y_pred, **weighted)
conf_matrix = confusion_matrix(y_test, y_pred)

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 score: {f1:.3f}")
print("Confusion matrix:")
print(conf_matrix)

In [None]:
# 8-fold cross-validated accuracy of the KNN classifier on the full (not oversampled) data
cv_scores = cross_val_score(knn, X, y, cv=8, scoring='accuracy')
mean_cv_score = np.mean(cv_scores)
mean_cv_score

- Compared with the other models, the Random Forest model performs well. We choose Random Forest as the best model based on the metrics.

In [None]:
from sklearn.model_selection import GridSearchCV

# Seeded base estimator; the grid search clones it for every candidate
rfc = RandomForestClassifier(random_state=26)

# Exhaustive grid: 4 x 4 x 3 x 3 x 2 = 288 combinations
param_grid = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# 8-fold cross-validation per combination, using every available core (n_jobs=-1);
# no `scoring` is given, so the classifier's own score (accuracy) is used
grid_search = GridSearchCV(rfc, param_grid=param_grid, cv=8, n_jobs=-1)

# Fit on the oversampled training rows.
# NOTE(review): oversampling happened before these CV folds were cut, so duplicated rows
# can land in both the train and validation folds and inflate best_score_.
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print('Best parameters:', grid_search.best_params_)

# Mean cross-validated accuracy of the best combination
print('Accuracy score:', grid_search.best_score_)


In [None]:
import pickle

# Reuse the estimator that GridSearchCV already refit on X_train / y_train with the
# winning hyperparameters (refit=True is its default). This avoids retyping the
# hyperparameters by hand and keeps the seeded, reproducible forest instead of
# training a new unseeded one.
best_model = grid_search.best_estimator_

# save the best model using pickle
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
