In [0]:
import os
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet

from sklearn.metrics import r2_score, mean_squared_error

In [0]:
# Filesystem locations used by the notebook. All three live under the same
# "IPIP-FFM-data-8Nov2018" folder; the two directory paths keep their trailing
# slash because later cells build file names by plain string concatenation.

# Directory the fitted models are pickled into.
MODEL_PATH = (
    "/content/drive/My Drive/Colab Notebooks/data_personality/"
    "IPIP-FFM-data-8Nov2018/models/"
)

# Directory holding the training CSV.
DATASET2_PATH = (
    "/content/drive/My Drive/Colab Notebooks/data_personality/"
    "IPIP-FFM-data-8Nov2018/dataset/"
)

# CSV of status updates used as the held-out evaluation set.
FB_DATA_PATH = (
    "/content/drive/My Drive/Colab Notebooks/data_personality/"
    "IPIP-FFM-data-8Nov2018/fb_status_data.csv"
)

In [22]:
# Load the training table. The text column becomes the model input; every
# column other than the user id and the text itself is kept as a target.
df = pd.read_csv(f"{DATASET2_PATH}final_df.csv", encoding="ISO-8859-1")

X_df = df['status_update']
y_df = df.drop(columns=['userid', 'status_update'])

# Sanity check: both shapes should report the same number of rows.
print(X_df.shape, y_df.shape)

(560237,) (560237, 5)


In [0]:
# Load the evaluation table. `STATUS` is the text fed to the model.
fb_status_dataset = pd.read_csv(FB_DATA_PATH, encoding="ISO-8859-1")
X_test = fb_status_dataset['STATUS']

# Columns that are not regression targets: the author id, the text itself,
# the c* columns, the date and the network statistics. Whatever is left
# after dropping them is what the training cell indexes by trait name.
drop_list = [
    '#AUTHID', 'STATUS',
    'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN',
    'DATE',
    'NETWORKSIZE', 'BETWEENNESS', 'NBETWEENNESS', 'DENSITY',
    'BROKERAGE', 'NBROKERAGE', 'TRANSITIVITY',
]
y_test = fb_status_dataset.drop(columns=drop_list)

In [30]:
class Model1_rf():
    """TF-IDF text features feeding a Ridge regressor.

    Despite the ``_rf`` suffix no random forest is involved: the only
    estimator created is ``Ridge``. (An earlier ``RandomForestRegressor``
    configuration existed here only as disabled code and has been dropped.)

    Attributes:
        lr: the ``Ridge`` regressor; replaced by the fitted estimator in ``fit``.
        tfidf: the ``TfidfVectorizer`` (English stop words, ASCII accent
            stripping); refit on every call to ``fit``.
    """

    def __init__(self):
        # NOTE(review): the `normalize` argument was deprecated in
        # scikit-learn 1.0 and removed in 1.2, so this line needs an older
        # release to run. It is kept unchanged because replacing it is not
        # numerically neutral -- confirm the target scikit-learn version.
        self.lr = Ridge(normalize=True)
        self.tfidf = TfidfVectorizer(stop_words='english', strip_accents='ascii')

    @staticmethod
    def _as_documents(X):
        """Return ``X`` as a list with one ``str`` per document.

        Each item is passed through ``str()``, so non-string entries are kept
        as text rather than rejected (a float NaN becomes ``'nan'``, which is
        also what the previous ``X.values.astype('U')`` conversion produced).
        Unlike that conversion this accepts any iterable (Series, list,
        ndarray) and does not allocate a fixed-width unicode array in which
        every row is padded to the length of the longest document.
        """
        return [str(doc) for doc in X]

    def fit(self, X, y, regression=True):
        """Fit the vectorizer on ``X`` and, if requested, the regressor on ``y``.

        Args:
            X: iterable of documents (e.g. a pandas Series of status texts).
            y: regression targets, one per document.
            regression: when true (default) the regressor is fitted on the
                TF-IDF features; when false only the vectorizer is fitted.

        Returns:
            self, so that calls can be chained.
        """
        features = self.tfidf.fit_transform(self._as_documents(X))
        if regression:
            self.lr = self.lr.fit(features, y)
        return self

    def predict(self, X, regression=True):
        """Predict targets for the documents in ``X``.

        Args:
            X: iterable of documents; vectorized with the already fitted
                vocabulary.
            regression: when true (default) the regressor's predictions are
                returned; when false nothing is predicted.

        Returns:
            The regressor's predictions, or ``None`` if ``regression`` is false.
        """
        features = self.tfidf.transform(self._as_documents(X))
        if regression:
            return self.lr.predict(features)
        return None

# Train, evaluate and persist one model per trait column.
traits = ['sOPN', 'sCON', 'sEXT', 'sAGR', 'sNEU']
y_pred = {}          # trait -> predictions on the evaluation statuses
y_pred_train = {}    # trait -> predictions on the training statuses

# Make sure the output folder exists before any fitting starts; otherwise a
# missing folder is only discovered by open() after the model has been fit.
linreg_dir = os.path.join(MODEL_PATH, 'linreg')
os.makedirs(linreg_dir, exist_ok=True)

for trait in traits:
    # A fresh instance per trait keeps each pickled model independent of the
    # others instead of refitting one shared object over and over.
    model1 = Model1_rf()

    X_regression, y_regression = X_df, y_df[trait]
    print(X_regression.shape)
    print('Fitting trait ' + trait + ' regression model...')
    model1.fit(X_regression, y_regression, regression=True)
    print('Done!')

    # Scores on the evaluation set loaded from the separate status CSV.
    y_pred[trait] = model1.predict(X_test)
    val_score = r2_score(y_test[trait], y_pred[trait])
    mse = mean_squared_error(y_test[trait], y_pred[trait])

    # Scores on the data the model was fitted on, for comparison.
    y_pred_train[trait] = model1.predict(X_df)
    val_score_train = r2_score(y_df[trait], y_pred_train[trait])
    mse_train = mean_squared_error(y_df[trait], y_pred_train[trait])

    print("r2_score_test : ", val_score)
    print("r2_score_train : ", val_score_train)
    print("MSE_test : ", mse)
    print("MSE_train : ", mse_train)

    # Persist the fitted vectorizer + regressor pair for this trait.
    with open(os.path.join(linreg_dir, trait + '_model_lr.pkl'), 'wb') as f:
        pickle.dump(model1, f)


(560237,)
Fitting trait sOPN regression model...
Done!
r2_score_test :  -0.19989480641376778
r2_score_train :  0.3279534783348973
MSE_test :  0.4115361651592359
MSE_train :  0.30775934693157614
(560237,)
Fitting trait sCON regression model...
Done!
r2_score_test :  -0.04858927333100582
r2_score_train :  0.3074224041274837
MSE_test :  0.5698366538370702
MSE_train :  0.3861424734794879
(560237,)
Fitting trait sEXT regression model...
Done!
r2_score_test :  -0.09806560083085714
r2_score_train :  0.3128478196509369
MSE_test :  0.8074792592907872
MSE_train :  0.4585487753644267
(560237,)
Fitting trait sAGR regression model...
Done!
r2_score_test :  -0.05847464758990206
r2_score_train :  0.3115053220852132
MSE_test :  0.4929726343332864
MSE_train :  0.3384468719730774
(560237,)
Fitting trait sNEU regression model...
Done!
r2_score_test :  -0.1062652683127181
r2_score_train :  0.2943586461853236
MSE_test :  0.639330726429574
MSE_train :  0.4564454885105973
