In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import shap
import matplotlib.pyplot as plt

# Load the dataset; the relative path is resolved against the working directory.
df = pd.read_csv("anime.csv")  # Replace with actual dataset path

# Impute missing values in every numeric column with that column's median.
# The result is assigned back to df[column] instead of calling
# df[column].fillna(..., inplace=True): the chained in-place form operates on
# an intermediate object, raises a FutureWarning, and no longer updates df
# once pandas 3.0 copy-on-write semantics apply.
# NOTE(review): any numeric column is imputed here, which would include the
# 'rating' target used later if it is parsed as numeric - confirm that
# median-filling the target is intended.
for column in df.select_dtypes(include=np.number).columns:
    df[column] = df[column].fillna(df[column].median())

# Feature engineering: build an example interaction term.
# 'name' and 'anime_id' are first replaced by the integer codes returned by
# pd.factorize so that they can be multiplied.
df['name'] = pd.factorize(df['name'])[0]
df['anime_id'] = pd.factorize(df['anime_id'])[0]
# NOTE(review): the factorized codes are arbitrary labels, so their product
# carries no ordinal meaning - kept only as the example feature.
df['User_Item_Interaction'] = df['name'] * df['anime_id']

# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column.
# NOTE(review): if 'genre' holds multi-label strings, this creates one dummy
# per distinct combination rather than per genre - confirm against the data.
df = pd.get_dummies(df, columns=['genre', 'type'], drop_first=True)

# Split the frame into predictors (everything except 'rating') and the
# 'rating' target.
X = df.drop(columns=['rating'])
y = df['rating']

# Coerce the target to numeric (unparseable entries become NaN) and fill any
# remaining gaps with the median. The filled Series is re-bound to y rather
# than modified in place, so the operation never writes through to df.
y = pd.to_numeric(y, errors='coerce')
y = y.fillna(y.median())

# Drop every remaining text (object-dtype) column that contains the
# placeholder 'Unknown'; StandardScaler cannot handle such values.
# na=False makes missing entries count as "no match" so the mask is boolean.
unknown_columns = [
    col
    for col in X.select_dtypes(include=['object']).columns
    if X[col].str.contains('Unknown', na=False).any()
]
X = X.drop(columns=unknown_columns)

# Train-test split is done BEFORE scaling so the scaler statistics (mean and
# standard deviation) are learned from the training rows only; fitting on the
# full dataset would leak information about the test set into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature scaling: fit on the training split, then apply the same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept for any later cell that still refers to the fully scaled matrix; it is
# transformed with the training-set statistics.
X_scaled = scaler.transform(X)

# Hyperparameter search space explored by the grid search below.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# The estimator must match the nature of the 'rating' target:
#   continuous  -> RandomForestRegressor (used here)
#   categorical -> swap in RandomForestClassifier(random_state=42)
model = RandomForestRegressor(random_state=42)

# Exhaustive search over param_grid with 5-fold cross-validation, using all
# available cores. fit() returns the fitted search object itself.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
).fit(X_train, y_train)

# Take the best estimator found by the search and predict on the held-out
# test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report metrics that fit the kind of model that was trained: accuracy and a
# per-class report for a classifier, MSE and R-squared for a regressor.
if isinstance(best_model, RandomForestClassifier):
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    report = classification_report(y_test, y_pred)
    print("Classification Report:\n", report)
elif isinstance(best_model, RandomForestRegressor):
    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)
    r_squared = r2_score(y_test, y_pred)
    print("R-squared:", r_squared)

# Column labels of the predictor frame, shared by both plots below.
feature_names = X.columns

# SHAP analysis: compute SHAP values for the test split with a tree explainer
# and draw the summary plot.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test, feature_names=feature_names)

# Horizontal bar chart of the ten largest feature importances reported by the
# fitted forest.
feature_importances = pd.Series(best_model.feature_importances_, index=feature_names)
top_features = feature_importances.nlargest(10)
top_features.plot(kind='barh')
plt.title("Top 10 Important Features")
plt.show()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].median(), inplace=True)


In [None]:
# Print the column labels of df to inspect the schema the DataFrame has when
# this cell is run.
print(df.columns)

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


## Interview Questions

### 1. Difference Between User-Based and Item-Based Collaborative Filtering
Collaborative filtering is a technique used in recommendation systems that suggests items based on user behavior and preferences. It comes in two main types:

| Feature | User-Based Collaborative Filtering | Item-Based Collaborative Filtering |
| --- | --- | --- |
| Definition | Finds users similar to the target user and recommends items liked by those similar users. | Finds items similar to the ones the user has interacted with and recommends those similar items. |
| Similarity Calculation | Compares users based on their past interactions with items. | Compares items based on how users have interacted with them. |
| Example | If User A and User B have similar tastes, and User B liked a movie, then User A is likely to enjoy it too. | If two movies are frequently watched by the same users, then a user who watches one may be recommended the other. |
| Scalability | Less scalable due to the high number of users and the dynamic nature of user preferences. | More scalable because item relationships remain relatively stable over time. |
| Common Algorithms | Pearson Correlation, Cosine Similarity on the user-item matrix. | Cosine Similarity, Jaccard Similarity on the item-user matrix. |

### 2. What Is Collaborative Filtering, and How Does It Work?
Collaborative filtering is a recommendation technique that suggests items based on the preferences and behaviors of users. It works on the assumption that if two users agree on one item, they are likely to agree on other items as well.

How It Works:

1. Collect Data – User interactions (e.g., ratings, purchases, clicks) are collected.
2. Create a Matrix – A user-item interaction matrix is created where rows represent users and columns represent items.
3. Compute Similarity – Based on past interactions, user similarity (user-based CF) or item similarity (item-based CF) is calculated.
4. Generate Recommendations – Using the similarity scores, predictions are made for items that a user may like.
5. Rank & Present – The system ranks the recommendations and presents the most relevant ones.

Types of Collaborative Filtering:

- Memory-Based – Directly uses similarity measures (e.g., Pearson correlation, cosine similarity).
- Model-Based – Uses machine learning techniques (e.g., matrix factorization, deep learning) to learn latent patterns and make recommendations.