# Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the dataset from its relative input path and preview the first seven rows.
csv_path = "../input/pantheon-project/database.csv"
df = pd.read_csv(csv_path)
df.head(n=7)

# Data Understanding

In [None]:
# Print the columns, non-null counts and dtypes of the freshly loaded frame.
df.info()

In [None]:
# Count the missing entries per column, then print the counts under a heading.
missing_per_column = df.isnull().sum()
print('Missing Value in Each Column:')
print(missing_per_column)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

# Quick Feature Selection

In [None]:
# Remove the columns that are not used as features further down, then discard
# every row that still contains a missing value and renumber the index from 0.
#
# The drop is written without inplace=True and with errors="ignore" so that
# re-running this cell after the columns are already gone is a no-op instead of
# raising KeyError.
unused_columns = ['article_id', 'full_name', 'state', 'latitude', 'longitude', 'city']
df = (
    df.drop(columns=unused_columns, errors="ignore")
      .dropna()
      .reset_index(drop=True)
)

In [None]:
# Re-check dtypes and non-null counts after dropping columns and incomplete rows.
df.info()

In [None]:
# Preview the first five rows of the reduced frame.
df.head()

In [None]:
# NOTE(review): same df.info() call as two cells above; df is not modified in
# between, so this prints the identical summary again.
df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [None]:
# Replace the labels of each categorical column with ordinal codes.
# The fitted encoder stays available as `encode` (e.g. for inverse_transform).
categorical_columns = ['sex', 'country', 'continent', 'occupation', 'industry', 'domain']
encode = OrdinalEncoder()
df[categorical_columns] = encode.fit_transform(df[categorical_columns])

In [None]:
# Clean the text-valued `birth_year` column and convert it to a numeric dtype.

# Two entries carry an approximate year as text; map them to plain integers.
df.loc[df["birth_year"] == "530s", "birth_year"] = 530
df.loc[df["birth_year"] == "1237?", "birth_year"] = 1237

# Rows whose birth year is the literal "Unknown" cannot be converted; keep the
# rest. A boolean mask (rather than drop-by-label) is used, and .copy() makes
# the result an independent frame so the assignment below is not a write into
# a slice of the old one.
df = df.loc[df["birth_year"] != "Unknown"].copy()

# Assign the converted Series to the column itself. Writing through
# `df.loc[:, "birth_year"] = ...` sets the values in place on pandas >= 2.0,
# which leaves the column with its original object dtype instead of a numeric one.
df["birth_year"] = pd.to_numeric(df["birth_year"])

In [None]:
# Preview the first five rows after encoding the categoricals and cleaning birth_year.
df.head()

In [None]:
# Check the resulting dtypes and row count before modelling.
df.info()

# Modeling

## Dataset Splitting

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Separate the regression target from the feature columns.
target_column = 'historical_popularity_index'
y = df[target_column]
X = df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Show the shapes of the four pieces as the cell output.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Learning 

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [None]:
# Hyper-parameter grid for the random forest search: 3 values for each of the
# 4 parameters, i.e. 81 candidate combinations.
param_rf = dict(
    n_estimators=[100, 150, 200],
    max_depth=[20, 50, 80],
    max_features=[0.3, 0.6, 0.8],
    min_samples_leaf=[1, 5, 10],
)

In [None]:
# Base estimator handed to the grid search: seeded for repeatability, all cores.
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Exhaustive search over `param_rf` with 3-fold cross-validation on the training split.
model_rf = GridSearchCV(estimator=rf, param_grid=param_rf, cv=3, n_jobs=-1, verbose=1)
model_rf.fit(X_train, y_train)

# Report the winning combination and the R squared of the search result on both splits.
train_r2 = model_rf.score(X_train, y_train)
test_r2 = model_rf.score(X_test, y_test)

print(model_rf.best_params_)
print("Train data R squared score: ", train_r2)
print("Test data R squared score: ", test_r2)

# Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Predict the target for the held-out rows using the fitted grid-search object.
y_pred = model_rf.predict(X_test)

In [None]:
# Score the held-out predictions with three regression metrics, in the order
# mean absolute error, mean squared error, R squared.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

In [None]:
# Print the test-set metrics as a small text report, one metric per line.
report_lines = [
    "The model performance for testing set",
    "--------------------------------------",
    f"MAE is {mae}",
    f"MSE is {mse}",
    f"R2 score is {r2}",
]
print("\n".join(report_lines))

# Feature Importances Based on Model

In [None]:
# Forest used for the feature-importance analysis below.
# Its hyper-parameters are taken from the grid search result rather than being
# hand-copied, so this model always matches whatever `model_rf` selected
# (the keys of best_params_ are exactly the keys of `param_rf`).
rf_selected = RandomForestRegressor(**model_rf.best_params_, n_jobs=-1, random_state=42)

In [None]:
# Random Forest Built In

rf_selected.fit(X_train, y_train)

# Read the forest's built-in importances once and order them ascending, so the
# most important feature ends up as the top bar of the horizontal chart.
builtin_importances = rf_selected.feature_importances_
sorted_idx = builtin_importances.argsort()

# plt.subplots returns a (Figure, Axes) pair; unpack it instead of binding the
# whole tuple to `fig`, and draw through the Axes explicitly.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(X_train.columns[sorted_idx], builtin_importances[sorted_idx])
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Feature")
ax.set_title("Feature Importances with Built In")

plt.show()

In [None]:
# Permutation

from sklearn.inspection import permutation_importance

# Measure how much the test-set score drops when each feature is shuffled.
# random_state pins the shuffles so the ranking is the same on every run,
# matching the seed used for the split and the forests.
perm_importance = permutation_importance(rf_selected, X_test, y_test, random_state=42)

# Ascending order puts the most important feature at the top of the chart.
mean_importances = perm_importance.importances_mean
sorted_idx = mean_importances.argsort()

# Unpack the (Figure, Axes) pair and draw through the Axes explicitly.
fig, ax = plt.subplots(figsize=(10, 5))
ax.barh(X_test.columns[sorted_idx], mean_importances[sorted_idx])
# The cell used to set the x label twice; only the second value was ever
# rendered, so that is the one kept here.
ax.set_xlabel("Feature Importance Score")
ax.set_ylabel("Feature")
ax.set_title("Feature Importances with Permutation")

plt.show()