In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, LinearRegression, Lasso, RidgeCV, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV


In [None]:
# Load dataset_case2.csv; its 'Unnamed: 0' column is used as the row index.
data = pd.read_csv(
    'dataset_case2.csv',
    index_col='Unnamed: 0',
)

In [None]:
import random

# Hold out whole users: every row of a sampled `insta_user_id` goes to the test
# set, so no user contributes rows to both the training and the test data.
SEED = 42        # fixed seed -> the same users are held out on every run
test_size = 20   # number of users (not rows) placed in the test set

# `data` is the frame read from 'dataset_case2.csv' in the preceding cell.
users = tuple(data['insta_user_id'].unique())
if test_size >= len(users):
    raise ValueError(
        f"test_size={test_size} must be smaller than the number of users ({len(users)})"
    )

# A private Random instance keeps the split reproducible without touching the
# global `random` state.
test_users = random.Random(SEED).sample(users, test_size)

is_test_row = data['insta_user_id'].isin(test_users)
df_test = data[is_test_row]
df_train = data[~is_test_row]

# Sanity checks on the split: every row is assigned exactly once and the two
# sets share no user.
assert len(df_train) + len(df_test) == len(data)
assert set(df_train['insta_user_id']).isdisjoint(df_test['insta_user_id'])

# Columns excluded from the predictors: the target 'PERMA', the P/E/R/M/A columns
# (presumably its sub-scores -- TODO confirm) and the id columns.
non_feature_cols = ['PERMA', 'P', 'E', 'R', 'M', 'A', 'image_id', 'user_id', 'insta_user_id']

y_train = df_train['PERMA']
X_train = df_train.drop(columns=non_feature_cols)

y_test = df_test['PERMA']
X_test = df_test.drop(columns=non_feature_cols)

**OLS**

In [None]:
# 10-fold cross-validation of plain least squares on the training rows; `cv` holds
# one score per held-out fold.
# NOTE(review): cv=10 splits by row, whereas the test set was split by
# `insta_user_id` -- a group-aware splitter may be more appropriate; confirm intent.
ols = LinearRegression()
cv = cross_val_score(estimator=ols, X=X_train, y=y_train, cv=10)
print('cross validation scores : ', cv)

In [None]:
# Fit OLS on the full training set, then report its score on the training rows
# and on the held-out users.
ols_model = ols.fit(X_train, y_train)
ols_train_score = ols.score(X_train, y_train)
ols_test_score = ols.score(X_test, y_test)
print('Train score: ', ols_train_score)
print('Test score: ', ols_test_score)

In [None]:
# Biggest features: the 20 largest (most positive) OLS coefficients.
# Original author's observation (not re-verified here): only filters show up as
# the most important features.
#
# The table needs one row per feature. `coef_` is flattened with np.ravel so this
# works whether it has shape (n_features,) or (1, n_features); transposing a 1-D
# `coef_` would instead yield a single row and make the feature-name assignment
# fail with a length mismatch.
ols_coefficients = pd.DataFrame(np.ravel(ols_model.coef_))   # column 0 = coefficient
ols_coefficients['feature_names'] = X_train.columns
ols_coefficients.nlargest(20, columns=0)

In [None]:
# Number of rows in the OLS coefficient table.
ols_coefficients.shape[0]

In [None]:
# One marker per feature at the height of its OLS coefficient, coloured by sign
# (reversed "bwr" colormap: negative -> red, positive -> blue).
ols_coefs = np.ravel(ols_model.coef_)
# Tick positions are derived from the data instead of a hard-coded count, so the
# number of ticks always matches the number of column labels.
feature_positions = np.arange(X_train.shape[1])
plt.scatter(feature_positions, ols_coefs, c=np.sign(ols_coefs), cmap="bwr_r")
plt.xticks(feature_positions, X_train.columns, fontsize=6, rotation=90)
plt.title('OLS coefficients');

**Ridge**

In [None]:
# Ridge: 10-fold grid search over 14 log-spaced alpha values from 1e-3 to 1e15,
# keeping the per-fold training scores as well.
# NOTE(review): the upper end (1e15) is extremely strong regularisation -- confirm
# the range is intended.
ridge_alphas = np.logspace(-3, 15, num=14)
param_grid = {'alpha': ridge_alphas}
grid = GridSearchCV(
    estimator=Ridge(),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train)

In [None]:
# Mean train / validation score of the grid search as a function of alpha.
# `param_alpha` can come back as an object-dtype column (depending on the
# scikit-learn / pandas versions); cast it to float so the x axis carries the real
# alpha values and the log scale is meaningful.
results = pd.DataFrame(grid.cv_results_).assign(
    param_alpha=lambda frame: frame['param_alpha'].astype(float)
)

# Draw both curves on one explicitly created Axes instead of relying on the
# pyplot "current axes" state.
fig, ax = plt.subplots()
results.plot('param_alpha', 'mean_train_score', ax=ax)
results.plot('param_alpha', 'mean_test_score', ax=ax)

ax.legend()
ax.set_ylabel('R^2')
ax.set_xscale("log");

In [None]:
# Mean training score per alpha candidate of the grid search (available because
# the search was created with return_train_score=True).
grid.cv_results_['mean_train_score']

In [None]:
# Mean held-out-fold score per alpha candidate of the grid search.
grid.cv_results_['mean_test_score']

In [None]:
# Winning alpha and its score. `best_score_` is the mean cross-validated score of
# the best candidate, i.e. "test" here means the held-out CV folds, not X_test.
ridge_best_params = grid.best_params_
ridge_best_cv_score = grid.best_score_
print(ridge_best_params)
print('best test score: ', ridge_best_cv_score)

In [None]:
# Score the refit best estimator on the training rows and on the held-out users.
ridge_train_score = grid.score(X_train, y_train)
ridge_test_score = grid.score(X_test, y_test)
print('best estimator train score :' , ridge_train_score)
print('best estimator test score :' , ridge_test_score)

In [None]:
# Biggest features: the 5 largest (most positive) coefficients of the best Ridge model.
#
# The table needs one row per feature. `coef_` is flattened with np.ravel so this
# works whether it has shape (n_features,) or (1, n_features); transposing a 1-D
# `coef_` would instead yield a single row and make the feature-name assignment
# fail with a length mismatch.
ridge_coefficients = pd.DataFrame(np.ravel(grid.best_estimator_.coef_))   # column 0 = coefficient
ridge_coefficients['feature_names'] = X_train.columns
ridge_coefficients.nlargest(5, columns=0)

In [None]:
# One marker per feature at the height of its coefficient in the best Ridge model,
# coloured by sign (reversed "bwr" colormap: negative -> red, positive -> blue).
ridge_coefs = np.ravel(grid.best_estimator_.coef_)
# Tick positions are derived from the data instead of a hard-coded count, so the
# number of ticks always matches the number of column labels.
feature_positions = np.arange(X_train.shape[1])
plt.scatter(feature_positions, ridge_coefs, c=np.sign(ridge_coefs), cmap="bwr_r")
plt.xticks(feature_positions, X_train.columns, fontsize=6, rotation=90)
plt.title('best Ridge coefficients');

**Lasso**

In [None]:
# Lasso: 10-fold grid search over 14 log-spaced alpha values from 1e-3 to 1e6,
# keeping the per-fold training scores as well.
# NOTE: this rebinds `param_grid` and `grid`, so from here on `grid` refers to the
# Lasso search rather than the Ridge one.
lasso_alphas = np.logspace(-3, 6, num=14)
param_grid = {'alpha': lasso_alphas}
grid = GridSearchCV(
    estimator=Lasso(),
    param_grid=param_grid,
    cv=10,
    return_train_score=True,
)
grid.fit(X_train, y_train)

In [None]:
# Mean train / validation score of the grid search as a function of alpha.
# `param_alpha` can come back as an object-dtype column (depending on the
# scikit-learn / pandas versions); cast it to float so the x axis carries the real
# alpha values and the log scale is meaningful.
results = pd.DataFrame(grid.cv_results_).assign(
    param_alpha=lambda frame: frame['param_alpha'].astype(float)
)

# Draw both curves on one explicitly created Axes instead of relying on the
# pyplot "current axes" state.
fig, ax = plt.subplots()
results.plot('param_alpha', 'mean_train_score', ax=ax)
results.plot('param_alpha', 'mean_test_score', ax=ax)

ax.legend()
ax.set_ylabel('R^2')
ax.set_xscale("log");


In [None]:
# Winning alpha and its score. `best_score_` is the mean cross-validated score of
# the best candidate, i.e. "test" here means the held-out CV folds, not X_test.
lasso_best_params = grid.best_params_
lasso_best_cv_score = grid.best_score_
print(lasso_best_params)
print('best test score: ', lasso_best_cv_score)

In [None]:
# Score the refit best estimator on the training rows and on the held-out users.
lasso_train_score = grid.score(X_train, y_train)
lasso_test_score = grid.score(X_test, y_test)
print('best estimator train score :' , lasso_train_score)
print('best estimator test score :' , lasso_test_score)

In [None]:
# Biggest features: the 5 largest (most positive) coefficients of the best
# estimator found by the grid search, one row per feature (column 0 = coefficient).
best_lasso = grid.best_estimator_
lasso_coefficients = pd.DataFrame(best_lasso.coef_)
lasso_coefficients['feature_names'] = X_train.columns
lasso_coefficients.nlargest(n=5, columns=0)