In [1]:
import pandas as pd
import numpy as np

In [2]:
import pandas as pd

# Load the three raw tables: user attributes, like metadata, and the
# user-to-like edge list that links the two.
users_df, likes_df, users_likes_df = (
    pd.read_csv(csv_path)
    for csv_path in ('users.csv', 'likes.csv', 'users-likes.csv')
)

# Keep only the edges whose user id is present in the users table.
known_user_mask = users_likes_df['userid'].isin(users_df['userid'])
filtered_users_likes_df = users_likes_df.loc[known_user_mask]

# Attach the like columns first, then the user columns. Both joins use the
# pandas default (inner), so edges without a matching key are dropped.
merged_df = pd.merge(filtered_users_likes_df, likes_df, on='likeid')
final_df = pd.merge(merged_df, users_df, on='userid')

# Persist the joined table without writing the index as a column.
final_df.to_csv('final.csv', index=False)

In [3]:

# Reload the merged table from disk and report how many rows it holds.
final_df = pd.read_csv('final.csv')
num_rows = len(final_df)
print(f'The final dataset contains {num_rows} rows.')

The final dataset contains 10612326 rows.


In [4]:
# Drop every row that has a missing value in any column.
# The result is rebound to `final_df` instead of using inplace=True: the
# in-place form has no performance benefit and cannot be chained.
# dropna() keeps the original row labels (the index is not reset), so any
# positional access further down must go through .iloc.
final_df = final_df.dropna()

# Report the row count that survives the filter.
num_rows = len(final_df)
print(f'The final dataset contains {num_rows} rows.')

The final dataset contains 2571490 rows.


In [5]:
# Per-column count of missing values.
na_values = final_df.isna().sum()
print(na_values)

# A single yes/no answer derived from the per-column counts above.
any_na = (na_values > 0).any()
print(f'Are there any NA values left in the dataset? {any_na}')

userid       0
likeid       0
name         0
gender       0
age          0
political    0
ope          0
con          0
ext          0
agr          0
neu          0
dtype: int64
Are there any NA values left in the dataset? False


In [6]:
# Preview both ends of the table.
for heading, preview in (
    ("First few rows for a sanity check:", final_df.head()),
    ("\nLast few rows for a sanity check:", final_df.tail()),
):
    print(heading)
    print(preview)

# Count rows that are exact copies of an earlier row.
duplicate_rows = final_df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_rows}")

# Summary statistics as produced by DataFrame.describe().
print("\nDescriptive statistics for numerical columns:")
print(final_df.describe())

# Frequency table for every object-dtype column.
print("\nValue counts for categorical columns:")
object_columns = final_df.select_dtypes(include=['object']).columns
for col in object_columns:
    print(f"\nValue counts for {col}:")
    print(final_df[col].value_counts())

# Column dtypes as inferred by pandas.
print("\nData types of each column:")
print(final_df.dtypes)

First few rows for a sanity check:
                                userid                            likeid  \
7774  ce110562b3e2f7e5cad3775b32d9caa5  b65f46d64c688fe98bdbcf93a76a71fc   
7775  ce110562b3e2f7e5cad3775b32d9caa5  295533f33bf160e76d7e95efadfe196c   
7776  ce110562b3e2f7e5cad3775b32d9caa5  a4f3b9c60196cb0920b2868d057c7961   
7777  ce110562b3e2f7e5cad3775b32d9caa5  1c88edef6a3d9b9fba1381f3db085dc9   
7778  ce110562b3e2f7e5cad3775b32d9caa5  e327fc881114e13a28e7513f6b85c4b1   

                                              name  gender  age  political  \
7774  Yo también me rei de la caída de otro jejeje       1   27        0.0   
7775                                       Titanic       1   27        0.0   
7776                                  Skinny Jeans       1   27        0.0   
7777                                        iTunes       1   27        0.0   
7778                         Official Grease Movie       1   27        0.0   

       ope   con   ext   agr   neu  
77

In [7]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD

# One-hot encode the like identifier of every row.
# The encoder is created with its defaults: sparse output is the default, and
# the explicit `sparse=` keyword was deprecated in scikit-learn 1.2 and removed
# in 1.4 (renamed `sparse_output`), so passing it raises a TypeError on current
# releases. Relying on the default works on both old and new versions.
# NOTE(review): the encoding is applied per row of final_df, so each sample
# carries exactly one non-zero indicator (its own likeid). If user-level
# features were intended, the likes would have to be aggregated per userid
# first -- confirm the intended design.
encoder = OneHotEncoder()
likes_matrix = encoder.fit_transform(final_df[['likeid']])

# Project the sparse indicator matrix onto 100 components. TruncatedSVD
# accepts sparse input directly; the seed is fixed for reproducibility.
svd = TruncatedSVD(n_components=100, random_state=42)
likes_matrix_reduced = svd.fit_transform(likes_matrix)



In [17]:
# Observed range (min and max) of each of the five personality score columns.
trait_columns = ['ope', 'con', 'ext', 'agr', 'neu']
personality_scores_min_max = final_df.loc[:, trait_columns].agg(['min', 'max'])
print(personality_scores_min_max)

          ope       con       ext       agr       neu
min -5.736013 -3.534098 -3.148309 -4.092900 -2.192688
max  1.635284  2.290445  1.831796  2.313524  2.718107


In [18]:
# Count missing values per column (isna is the same check as isnull).
anomalies = final_df.isna().sum()
print("Anomalies in each column after merging:", anomalies, sep="\n")

Anomalies in each column after merging:
userid       0
likeid       0
name         0
gender       0
age          0
political    0
ope          0
con          0
ext          0
agr          0
neu          0
dtype: int64


In [19]:
# Number of distinct users and distinct likes present in the table.
unique_users, unique_likes = (
    final_df[id_column].nunique() for id_column in ('userid', 'likeid')
)
print(f"Unique users: {unique_users}, Unique likes: {unique_likes}")


Unique users: 27802, Unique likes: 587293


In [20]:
# Summary statistics from DataFrame.describe() (count, mean, std, min,
# quartiles, max), kept in a variable and printed as plain text.
numerical_stats = final_df.describe()
print(numerical_stats)

             gender           age     political           ope           con  \
count  2.571490e+06  2.571490e+06  2.571490e+06  2.571490e+06  2.571490e+06   
mean   5.844565e-01  2.837862e+01  3.019063e-01  1.165389e-16 -7.604205e-18   
std    4.928156e-01  7.800643e+00  4.590849e-01  1.000000e+00  1.000000e+00   
min    0.000000e+00  1.800000e+01  0.000000e+00 -5.736013e+00 -3.534098e+00   
25%    0.000000e+00  2.400000e+01  0.000000e+00 -6.688696e-01 -6.957417e-01   
50%    1.000000e+00  2.600000e+01  0.000000e+00  1.590299e-01  3.355808e-02   
75%    1.000000e+00  3.000000e+01  1.000000e+00  8.073849e-01  6.938700e-01   
max    1.000000e+00  7.900000e+01  1.000000e+00  1.635284e+00  2.290445e+00   

                ext           agr           neu  
count  2.571490e+06  2.571490e+06  2.571490e+06  
mean   7.460521e-18  2.309998e-17  7.825257e-17  
std    1.000000e+00  1.000000e+00  1.000000e+00  
min   -3.148309e+00 -4.092900e+00 -2.192688e+00  
25%   -6.582565e-01 -6.073379e-01 -7.1

In [21]:
# Show the dtype pandas holds for each column of final_df.
print(final_df.dtypes)

userid        object
likeid        object
name          object
gender         int64
age            int64
political    float64
ope          float64
con          float64
ext          float64
agr          float64
neu          float64
dtype: object


In [23]:
# Count rows whose userid already appeared in an earlier row.
# A large number here does not indicate bad data by itself: any user with
# more than one row contributes (rows - 1) to this count.
duplicate_ids = final_df.duplicated(subset=['userid']).sum()
print(f"Duplicate user IDs found: {duplicate_ids}")

Duplicate user IDs found: 2543688


In [24]:
# Count repeated (userid, likeid) pairs, i.e. the same like recorded more
# than once for the same user.
user_like_pairs = final_df[['userid', 'likeid']]
duplicate_likes_per_user = user_like_pairs.duplicated().sum()
print(f"Duplicate likes per user found: {duplicate_likes_per_user}")

Duplicate likes per user found: 0


In [25]:
# Flag rows with a negative age or an `ope` score outside [-5.74, 1.64].
# NOTE(review): these `ope` bounds look like the rounded min/max observed in
# this same dataset, in which case the `ope` part of the check can never
# fire -- confirm the intended valid range.
age_values, ope_values = final_df['age'], final_df['ope']
logical_inconsistencies = (
    age_values.lt(0) | ope_values.lt(-5.74) | ope_values.gt(1.64)
)
print(f"Logical inconsistencies found: {logical_inconsistencies.sum()}")

Logical inconsistencies found: 0


In [26]:
from sklearn.model_selection import train_test_split

# Features: the SVD-reduced like matrix. Targets: three columns of final_df.
# The two are matched by row position, so final_df must still have the same
# rows, in the same order, as when likes_matrix_reduced was computed.
X = likes_matrix_reduced
y = final_df.loc[:, ['age', 'gender', 'political']]

# Hold out 20% of the rows for evaluation; the seed makes the split repeatable.
# NOTE(review): the split is made over rows of final_df, not over users, so
# rows belonging to one user can land on both sides -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Fit a random forest on the gender column only (fit() returns the estimator).
rf_clf = RandomForestClassifier(random_state=42).fit(X_train, y_train['gender'])

# Score the held-out rows.
gender_predictions = rf_clf.predict(X_test)
gender_test_labels = y_test['gender']
gender_accuracy = accuracy_score(gender_test_labels, gender_predictions)

print(f"Accuracy: {gender_accuracy}")
print(classification_report(gender_test_labels, gender_predictions))

Accuracy: 0.6004573224084091
              precision    recall  f1-score   support

           0       0.59      0.13      0.22    213806
           1       0.60      0.93      0.73    300492

    accuracy                           0.60    514298
   macro avg       0.59      0.53      0.47    514298
weighted avg       0.60      0.60      0.52    514298



In [28]:
from joblib import dump

# Serialize the currently bound `rf_clf` to disk. dump() returns the list of
# file names it wrote, which the notebook shows as the cell output.
dump(value=rf_clf, filename='rf_clf_gender.joblib')

['rf_clf_gender.joblib']

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Fit one forest on two targets at once. This rebinds `rf_clf`, replacing
# whatever estimator the name pointed to before.
multi_targets = y_train[['gender', 'political']]
rf_clf = RandomForestClassifier(random_state=63).fit(X_train, multi_targets)

# predict() returns one column per target, in the order the targets were
# given to fit(); transposing lets each column be unpacked by name.
predictions = rf_clf.predict(X_test)
gender_predictions, political_predictions = predictions.T

In [32]:
from sklearn.metrics import accuracy_score, classification_report

# Gender target: overall accuracy followed by the per-class report.
gender_truth = y_test['gender']
gender_accuracy = accuracy_score(gender_truth, gender_predictions)
print(f"Accuracy for gender prediction: {gender_accuracy}")
print(classification_report(gender_truth, gender_predictions))

# Political target: same two metrics.
political_truth = y_test['political']
political_accuracy = accuracy_score(political_truth, political_predictions)
print(f"Accuracy for political prediction: {political_accuracy}")
print(classification_report(political_truth, political_predictions))

Accuracy for gender prediction: 0.6002881597828497
              precision    recall  f1-score   support

           0       0.59      0.13      0.21    213806
           1       0.60      0.93      0.73    300492

    accuracy                           0.60    514298
   macro avg       0.59      0.53      0.47    514298
weighted avg       0.60      0.60      0.52    514298

Accuracy for political prediction: 0.702798766473912
              precision    recall  f1-score   support

         0.0       0.70      0.99      0.82    359284
         1.0       0.64      0.03      0.06    155014

    accuracy                           0.70    514298
   macro avg       0.67      0.51      0.44    514298
weighted avg       0.69      0.70      0.59    514298



In [None]:
# Disabled experiment: the whole cell is a bare triple-quoted string, so none
# of the code inside it runs.
# NOTE(review): inside the string, X_train_sample / X_test_sample and the
# matching y_*_sample splits are created, but the fit, predict and metric
# calls still reference X_train / X_test / y_train / y_test. If this cell is
# ever re-enabled, the sampled subset would not actually be used -- confirm
# which was intended, or delete the cell.
'''from sklearn.model_selection import train_test_split
X_sample, _, y_sample, _ = train_test_split(X, y, test_size=0.8, random_state=42)

X_train_sample, X_test_sample, y_train_sample, y_test_sample = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

rf_clf = RandomForestClassifier(random_state=42)

rf_clf.fit(X_train, y_train[['gender', 'political']]) 

predictions = rf_clf.predict(X_test)

gender_predictions = predictions[:, 0]
political_predictions = predictions[:, 1]

print(f"Accuracy for gender prediction: {accuracy_score(y_test['gender'], gender_predictions)}")
print(classification_report(y_test['gender'], gender_predictions))

print(f"Accuracy for political prediction: {accuracy_score(y_test['political'], political_predictions)}")
print(classification_report(y_test['political'], political_predictions))
'''