In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
# Read the CSV into a DataFrame, then record which columns are numeric and
# which are not; the two groups are imputed with different strategies below.
dataset = pd.read_csv("cinemart_dataset.csv")
numcol = dataset.select_dtypes(include="number").columns
charcol = dataset.select_dtypes(exclude="number").columns

In [None]:

# Negative fees are treated as invalid by this notebook, so blank them out
# BEFORE imputing. Otherwise they skew the column mean used as the fill value,
# and NaNs introduced by invalidating them afterwards would never be filled.
# (The later cell that marks negative fees as NaN then finds nothing to change.)
dataset["Monthly_Fee_USD"] = dataset["Monthly_Fee_USD"].mask(
    dataset["Monthly_Fee_USD"] < 0
)

# numerical data: missing values are replaced by the column mean
num_imputer = SimpleImputer(strategy='mean')
dataset[numcol] = num_imputer.fit_transform(dataset[numcol])

# character data: missing values are replaced by the most frequent value
charimp = SimpleImputer(strategy='most_frequent')
dataset[charcol] = charimp.fit_transform(dataset[charcol])


In [None]:

# A fee below zero is not a valid value: replace such entries with NaN.
dataset["Monthly_Fee_USD"] = dataset["Monthly_Fee_USD"].mask(
    dataset["Monthly_Fee_USD"] < 0
)


In [None]:

# Clean up the categorical text columns. For each column: trim surrounding
# whitespace, convert to title case, then map known misspellings/abbreviations
# (written as they look AFTER title-casing) to their canonical label.
corrections = {
    "Country": {"U.K": "United Kingdom"},
    "Subscription_Plan": {"B": "Basic"},
    "Device_Type": {"Andrd": "Android", "Ios": "iOS"},
    "Genre": {"Dramma": "Drama"},
    "Payment_Method": {"Cc": "Credit Card", "Card": "Credit Card"},
}

for column, fixes in corrections.items():
    tidy = dataset[column].str.strip().str.title()
    dataset[column] = tidy.replace(fixes)


In [None]:

# Total watch time per country, then the five largest totals in descending order.
countwatch = dataset.groupby("Country")["Watch_Time_Minutes"].agg("sum")
top5 = countwatch.sort_values(ascending=False).iloc[:5]
print("Top 5 countries by total watch time", top5, sep="\n")


In [None]:

# Bar chart of the five countries with the highest total watch time.
ax = top5.plot.bar(color="blue")
ax.set_title("Top 5 countries by total watch time")
ax.set_xlabel("Country")
ax.set_ylabel("Total Watch Time (Minutes)")
plt.show()


In [None]:

# Total watch time per genre, ranked from most to least watched; show the top ten.
genre = dataset.groupby("Genre")["Watch_Time_Minutes"].agg("sum")
pop = genre.sort_values(ascending=False)
print("Most Popular Genres (by Total Watch Time):", pop.iloc[:10], sep="\n")


In [None]:

# Bar chart of the ten genres with the highest total watch time.
ax = pop.iloc[:10].plot.bar(color="blue")
ax.set_title("Top 10 Genres by Total Watch Time")
ax.set_xlabel("Genre")
ax.set_ylabel("Total Watch Time (Minutes)")
plt.show()


In [None]:

# Mean monthly fee within each subscription plan, rounded to two decimals.
fee_by_plan = dataset.groupby("Subscription_Plan")["Monthly_Fee_USD"]
avg = fee_by_plan.mean().round(2)
print("Average Monthly Fee for Each Subscription Plan:", avg, sep="\n")


In [None]:

# Bar chart of the average monthly fee for each subscription plan.
ax = avg.plot.bar(color="blue")
ax.set_title("Average Monthly Fee by Subscription Plan")
ax.set_xlabel("Subscription Plan")
ax.set_ylabel("Average Monthly Fee (USD)")
plt.show()


In [None]:

# Mean rating within each genre, rounded to two decimals.
rating_by_genre = dataset.groupby("Genre")["Average_Rating"]
avgrat = rating_by_genre.mean().round(2)
print("Average Rating for Each Genre:", avgrat, sep="\n")


In [None]:

# Bar chart of average rating per genre, highest-rated genre first.
ranked_ratings = avgrat.sort_values(ascending=False)
ax = ranked_ratings.plot.bar(color="blue")
ax.set_title("Average Rating by Genre")
ax.set_xlabel("Genre")
ax.set_ylabel("Average Rating")
plt.show()


In [None]:

# Mean watch time within each subscription plan, rounded to two decimals.
watch_by_plan = dataset.groupby("Subscription_Plan")["Watch_Time_Minutes"]
avgwatch = watch_by_plan.mean().round(2)
print("Average Watch Time per Subscription Plan:", avgwatch, sep="\n")

# Bar chart of the per-plan averages computed above.
ax = avgwatch.plot.bar(color="blue")
ax.set_title("Average Watch Time by Subscription Plan")
ax.set_xlabel("Subscription Plan")
ax.set_ylabel("Average Watch Time (Minutes)")
plt.show()


In [None]:

# Total watch time per device type, largest total first.
device = dataset.groupby("Device_Type")["Watch_Time_Minutes"].sum().sort_values(ascending=False)
print("Total Watch Time by Device Type:")
print(device)

# Pie chart of each device type's share of the total watch time.
# NOTE(review): only four colours are listed; confirm the number of device types.
ax = device.plot(kind="pie", autopct="%1.1f%%", colors=["gold", "blue", "green", "orange"])
# pandas labels a pie's y-axis with the Series name ("Watch_Time_Minutes"),
# which shows up as a stray raw column name beside the chart; clear it.
ax.set_ylabel("")
plt.title("Watch Time Share by Device Type")
plt.show()