

```
Loading libraries
```



In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



Loading dataset

In [18]:
from google.colab import files
import pandas as pd
uploaded = files.upload()
file_name = list(uploaded.keys())[0]
try:
    data = pd.read_csv(file_name, encoding='latin1')
    print("File read successfully with latin1 encoding")
except UnicodeDecodeError:
    try:
        data = pd.read_csv(file_name, encoding='iso-8859-1')
        print("File read successfully with iso-8859-1 encoding")
    except UnicodeDecodeError:
        data = pd.read_csv(file_name, encoding='cp1252')
        print("File read successfully with cp1252 encoding")


print(data.head())


Saving IMDb Movies India.csv to IMDb Movies India (3).csv
File read successfully with latin1 encoding
                                 Name    Year Duration            Genre  \
0                                         NaN      NaN            Drama   
1  #Gadhvi (He thought he was Gandhi) -2019.0  109 min            Drama   
2                         #Homecoming -2021.0   90 min   Drama, Musical   
3                             #Yaaram -2019.0  110 min  Comedy, Romance   
4                   ...And Once Again -2010.0  105 min            Drama   

   Rating Votes            Director       Actor 1             Actor 2  \
0     NaN   NaN       J.S. Randhawa      Manmauji              Birbal   
1     7.0     8       Gaurav Bakshi  Rasika Dugal      Vivek Ghamande   
2     NaN   NaN  Soumyajit Majumdar  Sayani Gupta   Plabita Borthakur   
3     4.4    35          Ovais Khan       Prateik          Ishita Raj   
4     NaN   NaN        Amol Palekar  Rajat Kapoor  Rituparna Sengupta   

        

Some basic functions

In [22]:
print(data.tail())

                      Name    Year Duration          Genre  Rating Votes  \
15504  Zulm Ko Jala Doonga -1988.0      NaN         Action     4.6    11   
15505                Zulmi -1999.0  129 min  Action, Drama     4.5   655   
15506            Zulmi Raj -2005.0      NaN         Action     NaN   NaN   
15507        Zulmi Shikari -1988.0      NaN         Action     NaN   NaN   
15508         Zulm-O-Sitam -1998.0  130 min  Action, Drama     6.2    20   

            Director           Actor 1         Actor 2        Actor 3  
15504  Mahendra Shah  Naseeruddin Shah   Sumeet Saigal  Suparna Anand  
15505     Kuku Kohli      Akshay Kumar  Twinkle Khanna    Aruna Irani  
15506     Kiran Thej   Sangeeta Tiwari             NaN            NaN  
15507            NaN               NaN             NaN            NaN  
15508   K.C. Bokadia        Dharmendra      Jaya Prada    Arjun Sarja  


In [24]:
print(data.describe())

               Year       Rating
count  14981.000000  7919.000000
mean   -1987.012215     5.841621
std       25.416689     1.381777
min    -2022.000000     1.100000
25%    -2009.000000     4.900000
50%    -1991.000000     6.000000
75%    -1968.000000     6.800000
max    -1913.000000    10.000000


In [27]:
print(data.columns)

Index(['Name', 'Year', 'Duration', 'Genre', 'Rating', 'Votes', 'Director',
       'Actor 1', 'Actor 2', 'Actor 3'],
      dtype='object')


Selecting features

In [38]:
features = ['Director_name', 'Actor_1_name', 'Actor_2_name', 'Actor_3_name', 'Rating', 'Votes']
target = 'Rating'





drooping missing target values

In [39]:
data= data.dropna(subset=[target])

Handling missing values

In [49]:
for feature in features:
  if feature in data.columns:
          data[feature].fillna('Unknown',inplace=True)
  else:
               print(f"Feature '{feature}' not found in columns")

Feature 'Director_name' not found in columns
Feature 'Actor_1_name' not found in columns
Feature 'Actor_2_name' not found in columns
Feature 'Actor_3_name' not found in columns


Encode categorical variables

In [53]:
categorical_features=['director_name','actor_1_name','actor_2_name','actor_3_name','genres']
numerical_features=['budget','num_genres']

Encode numerical varibale

In [55]:
numerical_transformer=SimpleImputer(strategy='median')

Preprocessing the categorical and numerical data

In [58]:
preprocessor=ColumnTransformer(
    transformers=[
        ('num',numerical_transformer,numerical_features),
        ('cat',categorical_transformer,categorical_features)
    ])


Defining model

In [59]:
model=RandomForestRegressor(n_estimators=100,random_state=42)

Creating pipeline

In [60]:
pipeline =Pipeline(steps=[('preprocessor',preprocessor),
                          ('model',model)
                          ])

Splitting the data

In [61]:
x= data.drop(target,axis=1)
y = data[target]
x_train,x_test,y_train, y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
full code and finding accuracy

In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

data = pd.DataFrame({
    'category1': ['A', 'B', 'A', None],
    'category2': ['X', 'Y', 'Z', 'X'],
    'budget': [1000, 1500, 1200, 1300],
    'target': [0, 1, 0, 1]  # Classification target
})

# Features and target
features = ['category1', 'category2', 'budget']
target = 'target'

# Splitting the  data
X = data[features]
y = data[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the categorical transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Define the column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, ['category1', 'category2']),
        # If 'budget' is a numerical feature, handle it separately if needed
        ('num', 'passthrough', ['budget'])
    ]
)

# Define the pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())  # Using RandomForestClassifier for classification
])

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict and evaluate
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))




Accuracy: 0.0


Making predictions

In [72]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')


RMSE: 1.0
MAE: 1.0
R2: nan


