# Income Prediction 

In [1]:
import numpy as np
import pandas as pd

## Import Training Data

In [2]:
# Load the labelled training set and preview its first rows.
# NOTE(review): this file is read from the working directory, while the test
# file is read from 'Data/' -- confirm that both locations are intended.
train_csv_path = 'tcd ml 2019-20 income prediction training (with labels).csv'
df_input = pd.read_csv(train_csv_path)
df_input.head()

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Income in EUR
0,1,1997.0,0,41.0,Belarus,1239930,steel workers,Bachelor,0,Blond,193,61031.94416
1,2,1996.0,other,41.0,Singapore,1603504,safe event coordinator,Master,0,Black,186,91001.32764
2,3,2018.0,other,28.0,Norway,1298017,receivables/payables analyst,PhD,1,Brown,170,157982.1767
3,4,2006.0,other,33.0,Cuba,751903,fleet assistant,No,1,Black,171,45993.75793
4,5,2010.0,female,46.0,United Arab Emirates,95389,lead trainer,0,0,Blond,188,38022.16217


## Import Test Data

In [3]:
# Load the unlabelled test set (its 'Income' column is empty) and preview it.
# NOTE(review): this file is read from 'Data/', while the training file is read
# from the working directory -- confirm that both locations are intended.
test_csv_path = 'Data/tcd ml 2019-20 income prediction test (without labels).csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Hair Color,Body Height [cm],Income
0,111994,1992.0,other,21.0,Honduras,391652,senior project analyst,Master,1,Brown,153,
1,111995,1986.0,other,34.0,Kyrgyzstan,33653,greeter,Bachelor,0,Black,163,
2,111996,1994.0,unknown,53.0,Portugal,34765,liaison,Bachelor,1,Blond,153,
3,111997,1984.0,0,29.0,Uruguay,1494132,occupational therapist,No,0,Black,154,
4,111998,2007.0,other,17.0,Serbia,120661,portfolio manager,No,0,Red,191,


### List of Column Names

In [4]:
# Show the training column names as a NumPy string array.
np.array(df_input.columns.tolist())

array(['Instance', 'Year of Record', 'Gender', 'Age', 'Country',
       'Size of City', 'Profession', 'University Degree', 'Wears Glasses',
       'Hair Color', 'Body Height [cm]', 'Income in EUR'], dtype='<U17')

### Deleting Hair Color for better performance

In [5]:
# Remove the 'Hair Color' column from the training frame.
df_input = df_input.drop(columns=['Hair Color'])

In [6]:
# Remove the 'Hair Color' column from the test frame as well.
df_test = df_test.drop(columns=['Hair Color'])

### Checking Null Value Count

In [7]:
# Count missing values per training column.
df_input.isna().sum()

Instance                0
Year of Record        441
Gender               7432
Age                   494
Country                 0
Size of City            0
Profession            322
University Degree    7370
Wears Glasses           0
Body Height [cm]        0
Income in EUR           0
dtype: int64

In [8]:
# Count missing values per test column.
df_test.isna().sum()

Instance                 0
Year of Record         295
Gender                4862
Age                    279
Country                  0
Size of City             0
Profession             195
University Degree     4857
Wears Glasses            0
Body Height [cm]         0
Income               73230
dtype: int64

### Filling missing numeric values with the column mean

In [9]:
# Fill gaps in the numeric columns with each column's mean.
# numeric_only=True restricts the mean to numeric columns: without it,
# pandas >= 2.0 raises TypeError on the string columns (Gender, Country, ...),
# while older versions silently skipped them. String columns are left as-is.
df_input = df_input.fillna(df_input.mean(numeric_only=True))

In [10]:
# Fill gaps in the numeric columns with each column's mean.
# numeric_only=True restricts the mean to numeric columns: without it,
# pandas >= 2.0 raises TypeError on the string columns (Gender, Country, ...),
# while older versions silently skipped them. The all-empty 'Income' column
# has no mean, so it stays empty.
# NOTE(review): the test set is imputed with its own means rather than the
# training means -- confirm this is intended.
df_test = df_test.fillna(df_test.mean(numeric_only=True))

### Filling missing string values with the column mode

In [11]:
# Impute each categorical column with its most frequent value.
train_modes = {
    name: df_input[name].mode()[0]
    for name in ("Gender", "Profession", "University Degree")
}
df_input = df_input.fillna(train_modes)

In [12]:
# Impute each categorical column with its most frequent value.
test_modes = {
    name: df_test[name].mode()[0]
    for name in ("Gender", "Profession", "University Degree")
}
df_test = df_test.fillna(test_modes)

### Checking that no null values remain

In [13]:
# Re-count missing values per training column after imputation.
df_input.isna().sum()

Instance             0
Year of Record       0
Gender               0
Age                  0
Country              0
Size of City         0
Profession           0
University Degree    0
Wears Glasses        0
Body Height [cm]     0
Income in EUR        0
dtype: int64

In [14]:
# Re-count missing values per test column after imputation
# ('Income' is the still-empty prediction target).
df_test.isna().sum()

Instance                 0
Year of Record           0
Gender                   0
Age                      0
Country                  0
Size of City             0
Profession               0
University Degree        0
Wears Glasses            0
Body Height [cm]         0
Income               73230
dtype: int64

### Applying a moving-average filter to reduce noisy values

In [15]:
from scipy.signal import lfilter

# 125-tap moving-average FIR filter: every tap is 1/125, denominator is 1.
window = 125  # the larger the window, the smoother the result
taps = [1.0 / window for _ in range(window)]
numeric_cols = ["Year of Record", "Age", "Size of City", "Wears Glasses", "Body Height [cm]"]

# NOTE(review): axis=-1 is lfilter's default and is spelled out here on purpose.
# For this (rows, 5) selection it runs the filter ACROSS the five columns of
# each row, not down the rows: each output is the running sum of the row's
# columns so far divided by 125 (e.g. Year 1997 -> 15.976). Confirm that this
# is intended; the test frame must receive the identical transform.
df_input[numeric_cols] = lfilter(taps, 1, df_input[numeric_cols], axis=-1)

In [16]:
from scipy.signal import lfilter

# 125-tap moving-average FIR filter: every tap is 1/125, denominator is 1.
window = 125  # the larger the window, the smoother the result
taps = [1.0 / window for _ in range(window)]
numeric_cols = ["Year of Record", "Age", "Size of City", "Wears Glasses", "Body Height [cm]"]

# NOTE(review): axis=-1 is lfilter's default and is spelled out here on purpose.
# For this (rows, 5) selection it runs the filter ACROSS the five columns of
# each row, not down the rows: each output is the running sum of the row's
# columns so far divided by 125. Confirm that this is intended; the training
# frame must receive the identical transform.
df_test[numeric_cols] = lfilter(taps, 1, df_test[numeric_cols], axis=-1)

### Checking values after the filter

In [17]:
# Preview the first five training rows to inspect the filtered numeric columns.
df_input.head(n=5)

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Body Height [cm],Income in EUR
0,1,15.976,0,16.304,Belarus,9935.744,steel workers,Bachelor,9935.744,9937.288,61031.94416
1,2,15.968,other,16.296,Singapore,12844.328,safe event coordinator,Master,12844.328,12845.816,91001.32764
2,3,16.144,other,16.368,Norway,10400.504,receivables/payables analyst,PhD,10400.512,10401.872,157982.1767
3,4,16.048,other,16.312,Cuba,6031.536,fleet assistant,No,6031.544,6032.912,45993.75793
4,5,16.08,female,16.448,United Arab Emirates,779.56,lead trainer,0,779.56,781.064,38022.16217


In [18]:
# Preview the first five test rows to inspect the filtered numeric columns.
df_test.head(n=5)

Unnamed: 0,Instance,Year of Record,Gender,Age,Country,Size of City,Profession,University Degree,Wears Glasses,Body Height [cm],Income
0,111994,15.936,other,16.104,Honduras,3149.32,senior project analyst,Master,3149.328,3150.552,
1,111995,15.888,other,16.16,Kyrgyzstan,285.384,greeter,Bachelor,285.384,286.688,
2,111996,15.952,unknown,16.376,Portugal,294.496,liaison,Bachelor,294.504,295.728,
3,111997,15.872,0,16.104,Uruguay,11969.16,occupational therapist,No,11969.16,11970.392,
4,111998,16.056,other,16.192,Serbia,981.48,portfolio manager,No,981.48,983.008,


### Replace 0 with unknown in Gender Column

In [19]:
# Treat the placeholder string '0' in Gender as the existing 'unknown' category.
gender_fix = {'0': 'unknown'}
df_input['Gender'] = df_input['Gender'].replace(gender_fix)
df_test['Gender'] = df_test['Gender'].replace(gender_fix)

### One-hot encoding the columns with string values

In [20]:
# One-hot encode each categorical column, replacing the original column with
# its indicator columns (appended at the end, in the order listed below).
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# Indicator columns are named after the raw category values (no prefix), which
# is what the later train/test column alignment relies on.
for column in ('Gender', 'Profession', 'University Degree', 'Country'):
    indicators = pd.get_dummies(df_input[column])
    df_input = pd.concat((df_input.drop(columns=column), indicators), axis=1)
df_input.shape

(111993, 1516)

In [21]:
# One-hot encode each categorical column, replacing the original column with
# its indicator columns (appended at the end, in the order listed below).
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# Indicator columns are named after the raw category values (no prefix), which
# is what the later train/test column alignment relies on.
for column in ('Gender', 'Profession', 'University Degree', 'Country'):
    indicators = pd.get_dummies(df_test[column])
    df_test = pd.concat((df_test.drop(columns=column), indicators), axis=1)
df_test.shape

(73230, 1498)

### Align both tables to the same set of columns before applying Linear Regression

In [22]:
# Drop the columns that exist only in the training frame (categories never
# seen in the test data), keeping the target column 'Income in EUR'.
test_columns = set(df_test.columns)
train_only_columns = [
    name for name in df_input.columns
    if name not in test_columns and name != 'Income in EUR'
]
df_input = df_input.drop(columns=train_only_columns)
df_input.shape

(111993, 1481)

In [23]:
# Drop the columns that exist only in the test frame (categories never seen
# in the training data), keeping the placeholder column 'Income'.
train_columns = set(df_input.columns)
test_only_columns = [
    name for name in df_test.columns
    if name not in train_columns and name != 'Income'
]
df_test = df_test.drop(columns=test_only_columns)
df_test.shape

(73230, 1481)

### Applying Linear Regression
### Holding out the last 30,000 entries for evaluation and training on the rest

In [24]:
from sklearn.linear_model import LinearRegression

# Features are every column except the target; the target is the income.
# NOTE(review): the 'Instance' row id is still among the features -- confirm
# whether it should be excluded (the test feature matrix must then match).
X = df_input.loc[:, df_input.columns != 'Income in EUR'].values
y = df_input["Income in EUR"].values

# Hold out the last `test_size` rows for evaluation; train on the rest.
test_size = 30000

X_train, X_test = X[:-test_size], X[-test_size:]
y_train, y_test = y[:-test_size], y[-test_size:]

# Fit once. The original fitted the same estimator twice on identical data,
# which doubled the training time without changing the model.
clf = LinearRegression().fit(X_train, y_train)

# R^2 on the held-out rows.
print(clf.score(X_test, y_test))

0.7155966843114785


In [25]:
# Print "prediction actual" for the first 200 held-out rows.
# One batched predict call replaces 200 single-row calls, and the loop
# variables no longer overwrite the X / y arrays built in the training cell.
preview_rows = 200
preview_predictions = clf.predict(X_test[:preview_rows])
for predicted, actual in zip(preview_predictions, y_test[:preview_rows]):
    print(predicted, actual)

8147.643511604052 29027.49544
199266.32885057805 136765.7695
409953.97731478093 356266.2085
475361.1944713625 400708.9818
58115.31711823819 60775.30183
26525.250171254855 32761.244039999998
13223.31678969739 37810.76192
64013.035078178626 51157.76369
687.7610303121619 6662.099034000001
-24239.97402601363 17450.05561
82862.2023665416 27387.2982
35862.907741616946 64724.876729999996
150487.78605980752 199764.8093
42395.371489706915 7928.719048999999
38886.11228091596 39949.23942
23450.448914249893 716212.0
132346.7341938722 38091.90994
-17470.661530424375 15314.80641
304935.0255366671 248592.5746
95972.2711017835 63535.84277
13750.320563744288 23929.54421
-5419.121902514715 22784.3852
154702.01739985822 87100.09671
39191.85363371251 814061.0
382522.6173969018 304184.5087
26333.060362409335 29845.551710000003
61980.297017059755 35315.76195
-51286.46783177974 25006.634169999998
161953.31096727727 169207.2925
109397.18390471814 49001.84602
62752.11283430038 54110.20522
95055.20492465375 366

### Writing predictions to file

In [26]:
# Build the test feature matrix from every column except the empty 'Income'
# placeholder, then predict incomes with the fitted model.
feature_mask = df_test.columns != 'Income'
test_X = df_test.loc[:, feature_mask].to_numpy()
prediction_value = clf.predict(test_X)

In [27]:
# Store the predictions in the 'Income' column and keep only the two columns
# that go into the submission.
df_test.loc[:, 'Income'] = prediction_value
file = df_test.loc[:, ['Instance', 'Income']]

### Exporting the submission file

In [28]:
# Write the submission (Instance, Income) with a header row and no index column.
# to_csv returns None when given a path, so the result is not assigned;
# index=False is the documented way to omit the row index.
file.to_csv(
    "tcd ml 2019-20 income prediction submission file.csv",
    index=False,
    header=True,
)