In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Load the prepared dataset; its INCOME column is used as the regression target below.
data = pd.read_csv(filepath_or_buffer="data/final_data.csv")

In [3]:
# Target vector first, then the feature matrix (every column except the target).
y = data.loc[:, "INCOME"]
X = data.drop(columns=["INCOME"])

In [4]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

### Linear Regression

In [5]:
# Ordinary least-squares baseline, scored on the held-out split.
# NOTE(review): the recorded score is about -1.7e19, i.e. the fit is numerically
# degenerate on the test rows -- possibly collinear dummy columns; TODO confirm.
LR = linear_model.LinearRegression()
LR.fit(X_train, y_train)
LR.score(X_test, y_test)

-1.677398569889651e+19

### Ridge Regression

In [6]:
# Ridge regression with alpha=85, scored on the held-out split.
# This is the estimator that is pickled and used for predictions further down.
RR = linear_model.Ridge(alpha=85, random_state=42)
RR.fit(X_train, y_train)
RR.score(X_test, y_test)

0.15757294603748428

### Lasso

In [7]:
# Lasso regression with alpha=85, scored on the held-out split.
LAS = linear_model.Lasso(alpha=85, random_state=42)
LAS.fit(X_train, y_train)
LAS.score(X_test, y_test)

0.15091859140242336

### Random Forest

In [8]:
# Random forest with trees capped at depth 6, scored on the held-out split.
RF = RandomForestRegressor(max_depth=6, random_state=42)
RF.fit(X_train, y_train)
RF.score(X_test, y_test)

0.14987449437771827

### Exporting the model

In [9]:
# Serialise the fitted Ridge model (RR) to the file 'model_pickle' in the working directory.
with open('model_pickle', mode='wb') as model_file:
    pickle.dump(RR, model_file)

# Without FE for Major

In [10]:
# Load the dataset variant stored in no_major_final.csv and preview its first rows.
data2 = pd.read_csv(filepath_or_buffer="data/no_major_final.csv")
data2.head(n=5)

Unnamed: 0,SIBS,DEGREE_BACHELOR,DEGREE_GRADUATE,DEGREE_HIGH SCHOOL,DEGREE_JUNIOR COLLEGE,PADEG_BACHELOR,PADEG_GRADUATE,PADEG_HIGH SCHOOL,PADEG_JUNIOR COLLEGE,PADEG_LT HIGH SCHOOL,...,GRANBORN_4.0,GRANBORN_ALL IN U.S,POLVIEWS_CONSERVATIVE,POLVIEWS_EXTREMELY LIBERAL,POLVIEWS_EXTRMLY CONSERVATIVE,POLVIEWS_LIBERAL,POLVIEWS_MODERATE,POLVIEWS_SLGHTLY CONSERVATIVE,POLVIEWS_SLIGHTLY LIBERAL,INCOME
0,0.0,1,0,0,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,30000
1,2.0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,6000
2,3.0,0,0,0,1,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,90000
3,4.0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,60000
4,2.0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,30000


In [11]:
# Same target/feature separation as for the first dataset, applied to data2.
y2 = data2.loc[:, "INCOME"]
X2 = data2.drop(columns=["INCOME"])

In [12]:
# 80/20 split of the second dataset, using the same seed as the first split.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=42,
)

### Linear Regression

In [13]:
# Ordinary least-squares baseline on the second dataset, scored on its held-out split.
# NOTE(review): the recorded score is about -5.5e18, the same degenerate pattern as the
# first linear regression -- possibly collinear dummy columns; TODO confirm.
LR2 = linear_model.LinearRegression()
LR2.fit(X2_train, y2_train)
LR2.score(X2_test, y2_test)

-5.515405562852804e+18

### Ridge Regression

In [14]:
# Ridge regression with alpha=90 on the second dataset, scored on its held-out split.
RR2 = linear_model.Ridge(alpha=90, random_state=42)
RR2.fit(X2_train, y2_train)
RR2.score(X2_test, y2_test)

0.11425950601755575

### Lasso

In [15]:
# Lasso regression with alpha=500 on the second dataset, scored on its held-out split.
LAS2 = linear_model.Lasso(alpha=500, random_state=42)
LAS2.fit(X2_train, y2_train)
LAS2.score(X2_test, y2_test)

0.1114772292514451

### Random Forest

In [16]:
# Random forest with trees capped at depth 2 on the second dataset, scored on its held-out split.
RF2 = RandomForestRegressor(max_depth=2, random_state=42)
RF2.fit(X2_train, y2_train)
RF2.score(X2_test, y2_test)

0.07222040405741348

# TF/Keras

In [37]:
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.utils import plot_model
from sklearn import metrics
import matplotlib.pyplot as plt

In [29]:
# Small feed-forward regressor: two ReLU hidden layers (20 and 5 units) followed by a
# single linear output unit that predicts INCOME.
# The input width is read from the training frame rather than hard-coded, so the
# network keeps matching X_train if the feature set changes.
modelff = Sequential()
modelff.add(Dense(20, input_shape=(X_train.shape[1],), activation="relu"))
modelff.add(Dense(5, activation="relu"))
modelff.add(Dense(1))

# 'accuracy' is a classification metric and is meaningless for a continuous target;
# report mean absolute error next to the MSE loss instead.
modelff.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_absolute_error'])

In [19]:
# Print the layer-by-layer architecture and parameter counts of the network.
modelff.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 20)                2580      
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 105       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6         
Total params: 2,691
Trainable params: 2,691
Non-trainable params: 0
_________________________________________________________________


In [30]:
# Train the network for the configured number of epochs.
# Keras' fit() defaults to a single epoch, so num_epochs must be passed explicitly;
# previously it was defined but never used.
num_epochs = 100
history = modelff.fit(X_train, y_train.values, epochs=num_epochs)



In [22]:
# Preview the first five rows of the dataset before predictions are attached.
data.head(n=5)

Unnamed: 0,SIBS,DEGREE_BACHELOR,DEGREE_GRADUATE,DEGREE_HIGH SCHOOL,DEGREE_JUNIOR COLLEGE,PADEG_BACHELOR,PADEG_GRADUATE,PADEG_HIGH SCHOOL,PADEG_JUNIOR COLLEGE,PADEG_LT HIGH SCHOOL,...,GRANBORN_4.0,GRANBORN_ALL IN U.S,POLVIEWS_CONSERVATIVE,POLVIEWS_EXTREMELY LIBERAL,POLVIEWS_EXTRMLY CONSERVATIVE,POLVIEWS_LIBERAL,POLVIEWS_MODERATE,POLVIEWS_SLGHTLY CONSERVATIVE,POLVIEWS_SLIGHTLY LIBERAL,INCOME
0,1.0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,8000
1,6.0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,7000
2,0.0,0,1,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,1,50000
3,8.0,0,0,1,0,0,0,0,0,1,...,0,1,0,0,0,0,1,0,0,25000
4,7.0,1,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,75000


In [23]:
# Attach the Ridge model's prediction for every row (training and test rows alike).
# Both the target and any INCOME_pred column left over from an earlier run are dropped
# before predicting, so re-running this cell does not feed the previous predictions
# back into the model as an extra feature.
data["INCOME_pred"] = RR.predict(
    data.drop(columns=["INCOME", "INCOME_pred"], errors="ignore")
)

In [24]:
# Preview again: the frame now ends with the INCOME_pred column next to INCOME.
data.head(n=5)

Unnamed: 0,SIBS,DEGREE_BACHELOR,DEGREE_GRADUATE,DEGREE_HIGH SCHOOL,DEGREE_JUNIOR COLLEGE,PADEG_BACHELOR,PADEG_GRADUATE,PADEG_HIGH SCHOOL,PADEG_JUNIOR COLLEGE,PADEG_LT HIGH SCHOOL,...,GRANBORN_ALL IN U.S,POLVIEWS_CONSERVATIVE,POLVIEWS_EXTREMELY LIBERAL,POLVIEWS_EXTRMLY CONSERVATIVE,POLVIEWS_LIBERAL,POLVIEWS_MODERATE,POLVIEWS_SLGHTLY CONSERVATIVE,POLVIEWS_SLIGHTLY LIBERAL,INCOME,INCOME_pred
0,1.0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,8000,54881.141374
1,6.0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,7000,24013.531877
2,0.0,0,1,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,1,50000,63645.460282
3,8.0,0,0,1,0,0,0,0,0,1,...,1,0,0,0,0,1,0,0,25000,31703.561658
4,7.0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,1,0,0,0,75000,43781.411405
