In [1]:
import numpy as np
import pandas as pd

# Input File

In [2]:
# Load the labelled training data; the trailing expression previews the first rows.
# NOTE(review): this path is relative to the working directory, while the test
# file is read from 'Data/' further down — confirm both locations are intended.
training_csv = 'tcd ml 2019-20 income prediction training (with labels).csv'
df_input = pd.read_csv(training_csv)
df_input.head()

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Income in EUR
0,1,1997.0,0,41.0,Belarus,1239930,steel workers,Bachelor,0,Blond,193,61031.94416
1,2,1996.0,other,41.0,Singapore,1603504,safe event coordinator,Master,0,Black,186,91001.32764
2,3,2018.0,other,28.0,Norway,1298017,receivables/payables analyst,PhD,1,Brown,170,157982.1767
3,4,2006.0,other,33.0,Cuba,751903,fleet assistant,No,1,Black,171,45993.75793
4,5,2010.0,female,46.0,United Arab Emirates,95389,lead trainer,0,0,Blond,188,38022.16217


## Pre Processing
### Filled nulls in numeric columns with the column median, dropped all rows with null string values, and one-hot encoded the string columns

In [3]:
# Fill gaps in numeric columns with the column median. numeric_only=True is
# needed on pandas >= 2.0, where median() no longer skips string columns.
df_input = df_input.fillna(df_input.median(numeric_only=True))
# Drop rows that still lack a value in any of these string columns.
df_input = df_input.dropna(subset=["Gender", "Profession", "University Degree", "Hair Color"], how="any")

# OneHot: replace each string column with its indicator columns, appended on
# the right in the order listed here (the test frame is encoded in the same order).
# NOTE(review): indicator columns are unprefixed, so a label shared by two
# source columns (e.g. "0") produces duplicate column names — confirm this is acceptable.
for column in ['Hair Color', 'Gender', 'Profession', 'University Degree', 'Country']:
    dummies = pd.get_dummies(df_input[column])
    # axis is passed by keyword: pandas 2.0 rejects a positional axis in concat().
    df_input = pd.concat((df_input.drop(columns=column), dummies), axis=1)
df_input.shape

(91151, 1511)

# Test File

In [4]:
# Load the test data (its Income column is empty); the trailing expression
# previews the first rows.
test_csv = 'Data/tcd ml 2019-20 income prediction test (without labels).csv'
df_test = pd.read_csv(test_csv)
df_test.head()

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Income
0,111994,1992.0,other,21.0,Honduras,391652,senior project analyst,Master,1,Brown,153,
1,111995,1986.0,other,34.0,Kyrgyzstan,33653,greeter,Bachelor,0,Black,163,
2,111996,1994.0,unknown,53.0,Portugal,34765,liaison,Bachelor,1,Blond,153,
3,111997,1984.0,0,29.0,Uruguay,1494132,occupational therapist,No,0,Black,154,
4,111998,2007.0,other,17.0,Serbia,120661,portfolio manager,No,0,Red,191,


## Pre Processing 
### Same as above, except that no rows are dropped

In [5]:
# Fill gaps in numeric columns with the column median. numeric_only=True is
# needed on pandas >= 2.0, where median() no longer skips string columns.
# No rows are dropped here, so the row count of the test frame is unchanged.
df_test = df_test.fillna(df_test.median(numeric_only=True))

# OneHot: replace each string column with its indicator columns, appended on
# the right in the order listed here (the training frame is encoded in the same order).
# NOTE(review): indicator columns are unprefixed, so a label shared by two
# source columns (e.g. "0") produces duplicate column names — confirm this is acceptable.
for column in ['Hair Color', 'Gender', 'Profession', 'University Degree', 'Country']:
    dummies = pd.get_dummies(df_test[column])
    # axis is passed by keyword: pandas 2.0 rejects a positional axis in concat().
    df_test = pd.concat((df_test.drop(columns=column), dummies), axis=1)
df_test.shape

(73230, 1505)

### Removed columns that are not common to both tables

In [6]:
# Column labels present only in the training frame. The target column is
# excluded from the drop list even though the test frame has no such label.
train_only = set(df_input.columns).difference(df_test.columns, ['Income in EUR'])
df_input.drop(columns=list(train_only), inplace=True)
df_input.shape

(91151, 1476)

In [7]:
# Column labels present only in the test frame. 'Income' is excluded from the
# drop list even though the training frame has no such label.
test_only = set(df_test.columns).difference(df_input.columns, ['Income'])
df_test.drop(columns=list(test_only), inplace=True)
df_test.shape

(73230, 1476)

### Applying Linear Regression

In [8]:
from sklearn.linear_model import LinearRegression

In [9]:
# Separate the target column from the feature matrix.
target_column = 'Income in EUR'
y = df_input[target_column].values
X = df_input.loc[:, df_input.columns != target_column].values

# Hold out the last `test_size` rows (taken in file order, no shuffling).
test_size = 30000
train_rows = slice(None, -test_size)
holdout_rows = slice(-test_size, None)

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[holdout_rows], y[holdout_rows]

In [10]:
# Fit the linear model on the training split. fit() returns the fitted
# estimator, so a single call is enough; a second identical fit only repeats work.
clf = LinearRegression().fit(X_train, y_train)

# Report the score on the held-out rows (R^2 for scikit-learn regressors).
print(clf.score(X_test, y_test))

0.6967193942336425


### Predicting Data

In [11]:
# Every column except the (empty) 'Income' column is a model input.
is_feature = df_test.columns != 'Income'
test_X = df_test.loc[:, is_feature].values

# Predict income for each test row with the fitted model.
prediction_value = clf.predict(test_X)

In [12]:
# Store the predictions in the 'Income' column, then keep only the two
# columns that go into the submission.
df_test['Income'] = prediction_value
submission_columns = ['Instance', 'Income']
file = df_test[submission_columns]

### Exporting Data to File

In [13]:
# Write the submission CSV with a header row and without the DataFrame index.
# to_csv returns None when given a path, so its result is not stored.
file.to_csv("tcd ml 2019-20 income prediction submission file.csv", index=False, header=True)