In [1]:
import pandas as pd
import numpy as np

#Data summarization tool
import pandas_profiling as pp

#Visualization tools
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

#Input missing values with MICE 
from impyute.imputation.cs import mice

## Import standard scaler and standardize all numeric variables(which are not target variables)
from sklearn.preprocessing import StandardScaler

#Warning suppression
import warnings
warnings.filterwarnings('ignore')

#Model tuning tools
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold, GridSearchCV 
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import mean_squared_error

#Regression Algorithm 
from sklearn.svm import SVR
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

import xgboost as xgb
import lightgbm as lgb

sns.set(style='white', context='notebook', palette='deep')

In [2]:
Train_df = pd.read_csv("train.csv")
Test_df = pd.read_csv("test.csv")
submission_sample_df = pd.read_csv("sample_submit.csv")

In [3]:
Train_df.sample(10)

Unnamed: 0,galactic year,galaxy,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,...,"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII),y
3108,1007012,Boötes II,0.946406,89.428369,40967.122628,0.949267,17.605656,13.612589,0.861028,0.971673,...,,,,,,,,,,0.102609
3711,1014049,Carina Dwarf (E206-G220),0.757331,68.584579,21939.60432,0.674618,13.938857,6.356593,0.724866,0.766743,...,0.628541,0.747742,0.923862,171.655011,166.992775,,4.377255,,,0.02672
3595,1012036,Boötes II,1.035831,84.013161,27722.890109,1.004407,17.731875,12.108194,1.075783,1.015537,...,1.013107,1.062431,1.072822,88.985249,105.5202,7.782782,4.5862,26.305598,0.505913,0.073909
3145,1007012,ESO 321-014[70],0.933335,80.920375,31265.773273,0.977339,17.657618,11.298041,0.848626,0.727172,...,,,,,,,,,,0.068741
3185,1008016,UGC 8508 (I Zw 060),1.080605,87.584084,37461.533249,1.080375,20.027302,11.755584,0.964757,0.82174,...,,,,,,,,,,0.141317
615,993012,Cas 1 (KK98 19),0.941279,79.054532,17374.965101,0.8273,15.970871,6.744599,0.878769,0.589002,...,,,,,,,,,,0.060547
2853,1005006,KUG 1210+301B (KK98 127),0.917028,76.135408,17403.8212,0.570047,11.337271,6.312458,0.62891,0.513015,...,0.655492,0.575018,0.90422,150.243274,150.806048,34.830664,5.3515,,0.852163,0.048112
3344,1010025,UGC 8651 (DDO 181),0.696114,63.04928,26450.180422,0.531363,7.350506,7.790751,0.636236,0.457493,...,0.540022,0.705865,0.993637,189.53852,189.457421,,7.797501,,,0.037607
1282,997002,IC 3104,0.662733,69.191241,13954.208697,0.635482,8.706361,6.189593,0.613685,0.464295,...,,,,,,,,,,0.044659
1955,1000000,"Cassiopeia Dwarf (Cas dSph, Andromeda VII)",1.116187,86.121512,57681.714317,1.090303,22.039843,12.641168,0.959668,0.986023,...,1.074215,1.03537,1.03221,65.40953,83.916873,,8.442742,,0.427929,0.17503


In [4]:
Test_df.sample(10)

Unnamed: 0,galactic year,galaxy,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,...,Current health expenditure (% of GGP),"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII)
343,1010025,Carina II,0.903945,82.352492,19355.865948,0.577838,13.780502,10.370882,0.697686,0.657446,...,4.537896,0.76226,0.737795,0.941611,175.495832,174.8149,46.430124,2.930465,7.925351,0.841932
342,1016064,"Leo II Dwarf (Leo B, DDO 93)",0.903199,81.593756,20175.796213,0.764089,14.143273,9.408389,0.785611,0.809592,...,9.219633,0.661529,0.887276,1.095748,139.555845,156.907708,12.380116,5.685938,3.491836,0.643408
461,1011030,Sagittarius Dwarf Irregular Galaxy (SagDIG),0.954096,80.192149,41645.093076,0.823286,15.763833,12.73536,0.85261,0.809721,...,6.681382,1.02889,0.898082,1.08801,107.784815,118.293462,20.497652,3.61334,21.474687,0.514933
725,1009020,Leo P,1.046803,91.522942,57931.374259,1.200405,19.611177,13.713159,1.177371,1.042899,...,,,,,,,,,,
151,1010025,Lacerta I (Andromeda XXXI),0.783617,77.744213,40638.754101,0.937423,17.499112,15.630745,0.919988,1.054276,...,6.770741,0.984523,0.927792,1.105348,93.489011,130.582108,16.407842,5.9857,10.629144,0.387153
160,1015056,Large Magellanic Cloud (LMC),0.790034,79.425296,21972.12293,0.831417,17.09594,6.481246,0.686543,0.597843,...,7.60823,0.726385,0.776223,0.967956,164.709313,143.100943,37.541783,,,0.637706
330,1013042,KUG 1210+301B (KK98 127),0.895482,79.269223,27165.777125,0.684036,14.838678,6.792488,0.700574,0.629785,...,8.816884,0.704991,0.769622,0.94008,178.236669,177.705542,54.523201,3.421385,,
266,1008016,Barnard's Galaxy (NGC 6822),0.858671,71.787106,31319.604901,0.968514,14.080219,8.478561,0.802189,0.7983,...,,,,,,,,,,
24,1008016,KKh 060,0.638031,64.127677,31205.573019,0.831811,17.587492,9.030719,0.685476,0.793508,...,,,,,,,,,,
272,1007012,ESO 383-087 (ISG 39),0.862022,70.802261,15214.443205,0.719456,9.785191,7.25757,0.645459,0.568823,...,,,,,,,,,,


In [5]:
submission_sample_df.sample(10)

Unnamed: 0,index,pred,opt_pred
342,342,0.08,60
800,800,0.08,10
78,78,0.08,100
367,367,0.08,60
729,729,0.08,20
224,224,0.08,70
501,501,0.08,40
779,779,0.08,20
304,304,0.08,60
716,716,0.08,20


In [6]:
# pp.ProfileReport(Train_df)

In [7]:
Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865 entries, 0 to 3864
Data columns (total 80 columns):
galactic year                                                                              3865 non-null int64
galaxy                                                                                     3865 non-null object
existence expectancy index                                                                 3864 non-null float64
existence expectancy at birth                                                              3864 non-null float64
Gross income per capita                                                                    3837 non-null float64
Income Index                                                                               3837 non-null float64
Expected years of education (galactic years)                                               3732 non-null float64
Mean years of education (galactic years)                                                   3502 non-null 

In [8]:
Train_df.describe()

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank",...,"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII),y
count,3865.0,3864.0,3864.0,3837.0,3837.0,3732.0,3502.0,3474.0,3474.0,3432.0,...,916.0,915.0,914.0,893.0,892.0,912.0,941.0,874.0,844.0,3865.0
mean,1000709.0,0.872479,76.798111,31633.240872,0.825154,14.723296,10.283959,0.804246,0.7459,135.129178,...,0.823561,0.844209,1.008465,121.754797,120.873428,21.252922,6.443023,22.261474,0.600733,0.082773
std,6945.463,0.162367,10.461654,18736.378445,0.194055,3.612546,3.319948,0.176242,0.199795,52.449535,...,0.18578,0.159041,0.087299,46.269362,46.795666,14.258986,4.804873,34.342797,0.205785,0.063415
min,990025.0,0.22789,34.244062,-126.906522,0.292001,3.799663,1.928166,0.273684,0.189874,9.925906,...,0.305733,0.369519,0.465177,23.224603,16.215151,-76.741414,-1.192011,-735.186886,0.089092,0.013036
25%,995006.0,0.763027,69.961449,20169.118912,0.677131,12.592467,7.654169,0.671862,0.597746,92.262724,...,0.690707,0.731264,0.9658,84.090816,82.23255,15.001028,4.113472,17.227899,0.430332,0.047889
50%,1000000.0,0.907359,78.995101,26600.768195,0.8273,14.942913,10.385465,0.824758,0.761255,135.914318,...,0.83541,0.862773,1.029947,120.069916,121.057923,22.182571,5.309497,24.472557,0.62464,0.05782
75%,1006009.0,0.99276,84.558971,36898.631754,0.970295,17.123797,12.884752,0.939043,0.893505,175.301993,...,0.970365,0.961369,1.068481,158.579644,157.815625,29.134738,6.814577,31.748295,0.767404,0.087389
max,1015056.0,1.246908,100.210053,151072.683156,1.361883,26.955944,19.057648,1.232814,1.269625,278.786613,...,1.237661,1.182746,1.18123,232.720847,233.915373,61.903641,36.538462,95.941245,1.098439,0.683813


In [9]:
Train_df.isna().sum()

galactic year                                                                   0
galaxy                                                                          0
existence expectancy index                                                      1
existence expectancy at birth                                                   1
Gross income per capita                                                        28
                                                                             ... 
Adjusted net savings                                                         2953
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total    2924
Private galaxy capital flows (% of GGP)                                      2991
Gender Inequality Index (GII)                                                3021
y                                                                               0
Length: 80, dtype: int64

In [10]:
-np.log(20)

-2.995732273553991

In [11]:
train_clean_df = Train_df.dropna(axis=1, thresh=1500)
train_clean_df.isna().sum()

galactic year                                                     0
galaxy                                                            0
existence expectancy index                                        1
existence expectancy at birth                                     1
Gross income per capita                                          28
Income Index                                                     28
Expected years of education (galactic years)                    133
Mean years of education (galactic years)                        363
Intergalactic Development Index (IDI)                           391
Education Index                                                 391
Intergalactic Development Index (IDI), Rank                     433
Population using at least basic drinking-water services (%)    1844
Population using at least basic sanitation services (%)        1850
Gross capital formation (% of GGP)                             2363
y                                               

In [12]:
clean_data_df = train_clean_df.dropna(axis=1, thresh=1932)
clean_data_df.isna().sum()

galactic year                                                     0
galaxy                                                            0
existence expectancy index                                        1
existence expectancy at birth                                     1
Gross income per capita                                          28
Income Index                                                     28
Expected years of education (galactic years)                    133
Mean years of education (galactic years)                        363
Intergalactic Development Index (IDI)                           391
Education Index                                                 391
Intergalactic Development Index (IDI), Rank                     433
Population using at least basic drinking-water services (%)    1844
Population using at least basic sanitation services (%)        1850
y                                                                 0
dtype: int64

In [13]:
# pp.ProfileReport(clean_data_df)

In [14]:
le = LabelEncoder()

In [15]:
encoded_galaxy = le.fit_transform(clean_data_df.galaxy)
encoded_galaxy.max()

180

In [16]:
mice_data_df = clean_data_df.drop(columns=['galaxy',
       'Population using at least basic drinking-water services (%)',
       'Population using at least basic sanitation services (%)'])
mice_data_df.sample(5)

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank",y
2602,1004004,1.029299,86.922955,79692.842508,1.134474,22.745031,18.050698,1.11634,1.103276,46.475035,0.296728
214,991020,0.872802,77.835498,26628.104226,0.705495,12.045568,8.424414,0.810689,0.666019,128.521265,0.060906
857,994009,1.088493,83.170006,52143.105646,1.079667,15.502207,11.405274,0.915467,0.816477,73.413562,0.15421
2685,1004004,0.754786,68.231464,16744.574843,0.532862,12.361786,10.83616,0.574918,0.56762,201.333134,0.046615
2923,1006009,1.059927,88.481634,27913.112035,0.910309,19.399827,16.120011,1.075726,1.08128,58.008703,0.22861


In [17]:
scaler=StandardScaler()
feature_scaled = scaler.fit_transform(mice_data_df.drop(columns = ['y']))
feature_scaled= pd.DataFrame(data = feature_scaled, columns = mice_data_df.drop(columns = ['y']).columns)
# feature_scaled["y"] = mice_data_df['y']
feature_scaled.sample(10)

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank"
2911,0.763178,-0.148182,0.285633,-0.293666,-0.467555,1.194477,-0.013376,0.354335,-0.032097,1.362479
2735,0.618749,0.16024,0.69283,0.189414,0.876605,0.964589,1.75741,0.620034,1.404818,-0.659738
2096,0.041895,-0.003428,-0.428774,-0.427118,0.414777,0.821124,0.511671,0.066785,0.901697,-0.089427
1399,-0.533807,-0.91078,-0.498673,-0.722034,-1.15032,-1.123604,-0.444077,-0.734452,-1.548399,0.658757
1472,-0.389954,-0.154689,0.684438,-0.512843,-0.087246,0.043722,1.544042,0.116845,0.564702,-0.128978
1989,0.041895,0.132912,0.526278,0.312755,0.995895,1.316728,1.145489,0.952741,1.031068,-0.905577
2035,0.041895,0.380163,0.524191,-0.987823,-0.532346,0.775133,0.880934,0.14403,0.537225,0.483414
2167,0.186036,0.530783,0.142041,0.034449,-0.158155,-0.045541,0.332136,-0.423277,0.404365,0.264172
69,-1.538478,-0.058174,-0.325605,-0.852249,0.060218,0.521014,,,,
1957,-0.102103,-0.20496,0.04837,-0.724454,-0.80783,-1.707966,-1.33215,-1.360896,-1.310514,0.881389


In [18]:
# start the MICE training
imputed_training=mice(feature_scaled)

In [19]:
imputed_training.sample(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
1733,-0.2461,0.724883,0.481439,-0.283139,0.685998,0.329734,-0.06196,0.861756,-0.230022,-0.711151
3060,0.763178,0.45867,1.020154,0.649723,0.965293,1.186012,1.29013,0.826079,2.163587,-0.917118
566,-1.108357,-1.54014,-1.494042,-0.943151,-0.752876,-1.115901,-1.080337,-1.274239,-1.16367,0.829046
1064,-0.821226,-3.002315,-2.888294,-0.812718,-1.953639,-1.974032,-1.224216,-2.283226,-1.640817,1.245824
1427,-0.533807,-0.516084,-0.278989,-0.465591,-0.526891,-1.138999,-0.08792,-0.551265,-0.016116,0.363318
239,-1.395201,-0.394295,-0.591594,-0.439413,-0.615564,-0.622551,-0.601833,-0.386167,-0.384915,-0.19864
1950,-0.102103,-0.026412,-0.347294,-0.69466,-0.105204,-0.749578,-0.495308,-0.458247,-0.589006,0.427949
2527,0.474463,0.500123,0.717104,-0.192973,0.435591,0.131851,0.282124,0.482891,0.281809,-0.038471
53,-1.538478,0.799929,0.235473,-0.146862,-0.316557,-0.675175,-0.285054,-0.630405,-1.05872,0.404972
1923,-0.102103,1.006597,0.732801,0.682152,0.952332,0.603849,0.071556,0.347164,0.765603,-0.847337


In [20]:
imputed_training.columns = mice_data_df.drop(columns = ['y']).columns


In [21]:
pp.ProfileReport(imputed_training)

HBox(children=(FloatProgress(value=0.0, description='variables', max=10.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='correlations', max=6.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='interactions [continuous]', style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='table', max=1.0, style=ProgressStyle(description_width='i…




HBox(children=(FloatProgress(value=0.0, description='missing', max=2.0, style=ProgressStyle(description_width=…









HBox(children=(FloatProgress(value=0.0, description='package', max=1.0, style=ProgressStyle(description_width=…




HBox(children=(FloatProgress(value=0.0, description='build report structure', max=1.0, style=ProgressStyle(des…






In [22]:
imputed_training.sample(10)

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank"
1304,-0.533807,-1.316632,-2.091962,-0.607916,-1.674394,-2.329643,-1.858215,-2.036344,-2.099207,1.673883
1496,-0.389954,0.114344,0.988401,-0.346643,0.776605,0.251608,1.374386,0.817346,0.848064,-0.862144
684,-1.108357,-1.482495,-1.07586,-0.315663,-1.050645,-0.709536,-1.405001,-2.103633,-1.21701,0.867479
294,-1.395201,-1.03378,-0.342141,-0.351513,-1.001215,-1.721667,-1.777803,-1.317927,-1.953509,0.654774
2580,0.474463,0.996967,0.632216,0.219129,0.194114,0.391296,0.754466,0.725634,0.544182,-0.491789
3372,1.341472,1.13555,1.169597,0.675481,1.42837,1.165687,0.988442,1.637461,1.864284,-0.847785
430,-1.251779,-3.593252,-3.838196,-0.230973,-1.809192,-1.697001,-1.704737,-1.937827,-1.539646,1.120243
2219,0.186036,-1.398855,-1.270884,-0.510679,-1.798616,-1.941166,-1.138112,-2.511923,-2.483041,1.671954
3566,1.631051,1.602644,0.914224,2.404334,1.352395,1.79108,1.59897,1.723318,1.392241,-1.802788
2095,0.041895,-0.22624,0.785984,-0.037957,0.675852,-0.485624,-0.794553,0.471218,-0.583748,0.546605


In [23]:
train_df = imputed_training
y = mice_data_df['y']

In [24]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.2, shuffle=True)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3092, 10) (773, 10) (3092,) (773,)


In [25]:
rs = 3
kfold = KFold(n_splits=10, random_state=rs, shuffle=True)

regressors = []
regressors.append(SVR())
regressors.append(GradientBoostingRegressor(random_state=rs))
regressors.append(ExtraTreesRegressor(n_estimators=rs))
regressors.append(RandomForestRegressor(random_state=rs))
regressors.append(xgb.XGBRegressor(random_state=rs, objective="reg:squarederror"))
regressors.append(lgb.LGBMRegressor(random_state=rs))

cv_results = []
for regressor in regressors:     #scores to be minimised are negated (neg)
    cv_results.append(np.sqrt(abs(cross_val_score(regressor, X_train, y=y_train, scoring='neg_mean_squared_error', cv=kfold))))

cv_means = []
cv_stds = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_stds.append(cv_result.std())
    
cv_res = pd.DataFrame({ 
    "Algorithm": ["SVR", "GBR", "EXR", "RFR", "XGBR", "LGBM"],
    "CrossValMeans": cv_means, "CrossValErrors": cv_stds
                       })
cv_res = cv_res.sort_values("CrossValMeans", ascending=True)
print(cv_res)

  Algorithm  CrossValMeans  CrossValErrors
1       GBR       0.031314        0.004459
4      XGBR       0.031542        0.004957
5      LGBM       0.031676        0.005200
3       RFR       0.032893        0.005346
2       EXR       0.034782        0.004482
0       SVR       0.056283        0.003973


In [26]:
gbr = GradientBoostingRegressor(random_state=rs)
xgbr = xgb.XGBRegressor(random_state=rs, objective="reg:squarederror")
lgbm = lgb.LGBMRegressor(random_state=rs)
rfr = RandomForestRegressor(random_state=rs)
exr = ExtraTreesRegressor(n_estimators=rs)
svr = SVR()


In [27]:
trainedmodelgbr = gbr.fit(X_train,y_train)
trainedmodelxgbr = xgbr.fit(X_train,y_train)
trainedmodellgbm = lgbm.fit(X_train,y_train)
trainedmodelrfr = rfr.fit(X_train,y_train)
trainedmodelexr = exr.fit(X_train,y_train)
trainedmodelsvr = svr.fit(X_train,y_train)

In [28]:
y_pred = trainedmodelgbr.predict(X_test)
print("GBR", mean_squared_error(y_pred, y_test))
y_pred = trainedmodelxgbr.predict(X_test)
print("XGBR", mean_squared_error(y_pred, y_test))
y_pred = trainedmodellgbm.predict(X_test)
print("LGBM", mean_squared_error(y_pred, y_test))
y_pred = trainedmodelrfr.predict(X_test)
print("RFR", mean_squared_error(y_pred, y_test))
y_pred = trainedmodelexr.predict(X_test)
print("EXR", mean_squared_error(y_pred, y_test))
y_pred = trainedmodelsvr.predict(X_test)
print("SVR", mean_squared_error(y_pred, y_test))

GBR 0.0011354837853015508
XGBR 0.0011279754723409587
LGBM 0.0009514173896435464
RFR 0.0010145785328526677
EXR 0.0015573116511727824
SVR 0.003244401164749733


In [29]:
testData = Test_df[imputed_training.columns]
testData.sample(5)

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank"
853,1015056,1.131963,89.41751,66881.425967,1.156961,20.638518,13.677196,0.989646,1.068227,59.899568
514,1012036,0.976022,84.53455,67130.941116,1.129016,20.691698,17.854808,1.03207,1.119825,81.118351
476,1007012,0.960905,81.032314,39866.366342,0.981671,16.245241,12.943035,0.95249,0.847697,111.080402
49,1015056,0.671839,68.143825,44345.751126,1.041028,11.042017,8.75492,0.755132,0.642207,210.817584
881,1009020,1.173076,89.891307,61968.904515,0.956639,18.9751,14.62396,1.00986,1.05184,74.39243


In [33]:
test_feature_scaled = scaler.fit_transform(testData)
test_feature_scaled= mice(pd.DataFrame(data = test_feature_scaled, columns = testData.columns))
test_feature_scaled.columns = testData.columns
test_feature_scaled.sample(10)

Unnamed: 0,galactic year,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,"Intergalactic Development Index (IDI), Rank"
592,-0.860345,0.548999,0.639084,0.036451,-0.019847,0.704812,0.848149,-0.468643,0.561434,-0.543654
516,-1.557595,0.395353,0.642037,-0.941435,-1.187647,-0.392531,-0.647371,-1.211584,-0.616403,0.322303
850,0.536238,1.50255,0.911029,1.1691,0.924612,2.000658,0.837435,0.870873,1.154526,-1.741312
136,0.536238,-1.117553,-1.03918,-1.017693,-1.49599,-0.05141,-0.605754,-1.143871,-0.692867,1.424952
252,1.235572,-0.542069,-1.654514,-1.157586,-1.957643,-0.97948,-1.369307,-1.260395,-0.687592,1.481054
754,-1.557595,1.032002,0.054148,-0.406722,-0.396201,0.335051,0.291101,0.586371,0.198367,0.058103
31,-1.557595,-2.039071,-1.284307,-1.022532,-1.078621,-1.573648,-2.312443,-1.833577,-2.377801,1.755884
163,1.585585,-0.980945,-1.071438,-0.638966,-1.197556,-1.034188,-1.41485,-1.409192,-1.36485,1.771669
609,1.235572,0.584495,0.574089,-0.622247,-0.919664,-0.813714,-0.587227,-0.736806,-0.540765,0.854937
384,-0.162401,-0.006327,-0.591558,-0.813931,-1.698916,-0.426741,-1.686448,-0.274233,-1.540599,1.301531


In [34]:
y_pred1 = trainedmodelgbr.predict(test_feature_scaled)
y_pred2 = trainedmodelxgbr.predict(test_feature_scaled)
y_pred3 = trainedmodellgbm.predict(test_feature_scaled)
y_pred4 = trainedmodelrfr.predict(test_feature_scaled)
y_pred5 = trainedmodelexr.predict(test_feature_scaled)
y_pred6 = trainedmodelsvr.predict(test_feature_scaled)

In [59]:
Train_df.loc[Train_df["galaxy" ] == "Leo P"]

Unnamed: 0,galactic year,galaxy,existence expectancy index,existence expectancy at birth,Gross income per capita,Income Index,Expected years of education (galactic years),Mean years of education (galactic years),Intergalactic Development Index (IDI),Education Index,...,"Intergalactic Development Index (IDI), female","Intergalactic Development Index (IDI), male",Gender Development Index (GDI),"Intergalactic Development Index (IDI), female, Rank","Intergalactic Development Index (IDI), male, Rank",Adjusted net savings,"Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total",Private galaxy capital flows (% of GGP),Gender Inequality Index (GII),y
12,990025,Leo P,1.068957,90.854092,48579.590188,1.082535,17.459732,13.422714,1.053766,0.964813,...,,,,,,35.631013,5.686449,28.920114,,0.178066
348,991020,Leo P,0.934927,84.373655,53556.906106,1.120308,16.722023,14.236939,0.916615,0.769212,...,,,,,,,,,,0.17894
460,992016,Leo P,1.000826,93.955119,55717.265289,1.010462,18.996268,12.039128,0.978445,0.972015,...,,,,,,,,,,0.179545
584,993012,Leo P,1.018948,85.078377,50753.836514,1.120491,16.659388,14.298606,1.044159,0.937519,...,,,,,,,,,,0.180208
878,994009,Leo P,1.048111,90.214493,52670.656452,1.053284,22.179132,11.700195,1.054855,0.944371,...,,,,,,,,,,0.186204
1030,995006,Leo P,0.958981,87.054244,52985.767198,1.004742,20.684727,13.710295,1.077371,0.942649,...,1.124896,1.01348,1.126474,54.444158,49.907317,24.511635,4.723043,38.36637,0.368338,0.184842
1251,996004,Leo P,1.15025,87.438512,39947.642346,1.044112,21.254146,14.122569,1.038296,0.923379,...,,,,,,,,,,0.189389
1308,997002,Leo P,1.105523,89.607899,62258.655175,1.178641,20.345008,13.895634,1.045173,0.951003,...,,,,,,,,,,0.188625
1464,998001,Leo P,1.074646,89.644665,64748.027781,1.124437,20.236569,12.854638,0.940422,1.021026,...,,,,,,,,,,0.188626
1734,999000,Leo P,1.048917,88.828308,64334.784697,1.069713,19.679865,15.758142,0.96304,1.11235,...,,,,,,,,,,0.188526


In [30]:
Test_df.isna().sum()

galactic year                                                                  0
galaxy                                                                         0
existence expectancy index                                                     5
existence expectancy at birth                                                  5
Gross income per capita                                                        5
                                                                            ... 
Intergalactic Development Index (IDI), male, Rank                            341
Adjusted net savings                                                         371
Creature Immunodeficiency Disease prevalence, adult (% ages 15-49), total    408
Private galaxy capital flows (% of GGP)                                      354
Gender Inequality Index (GII)                                                361
Length: 79, dtype: int64

In [32]:
Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 890 entries, 0 to 889
Data columns (total 79 columns):
galactic year                                                                              890 non-null int64
galaxy                                                                                     890 non-null object
existence expectancy index                                                                 885 non-null float64
existence expectancy at birth                                                              885 non-null float64
Gross income per capita                                                                    885 non-null float64
Income Index                                                                               885 non-null float64
Expected years of education (galactic years)                                               885 non-null float64
Mean years of education (galactic years)                                                   882 non-null float64
In

In [110]:
def allocate_energy(df):
    y_pred1= df["pred"]
    existence_index =  df["existence expectancy index"]
#     print("Ypred", y_pred1)
    extra_energy = 50000
    max_energy_allocation = 100
    min_energy_allocation = 0
    low_existence_index_mark = 0.7
#     if existence_index < low_existence_index_mark:
#         extra_energy = 0.1*50000
# #         print("Extra Energy ", extra_energy)
#     else:
#         extra_energy = 0.9*50000

    index_increase_potential = -(np.log(y_pred1+0.01)+3)

    index_increase_likelihood = (extra_energy * index_increase_potential **2) / 1000
#     print("Index increase likelihood ",index_increase_likelihood)


    if index_increase_likelihood> max_energy_allocation:
        galaxy_energy_allocation = 100
    elif index_increase_likelihood < min_energy_allocation:
        galaxy_energy_allocation = 0
    else:
        galaxy_energy_allocation = index_increase_likelihood

#     print("Galaxy energy allocation",galaxy_energy_allocation)
    return galaxy_energy_allocation

In [111]:
y = 0
t_df = Test_df
for x in [y_pred1,y_pred2,y_pred3,y_pred4,y_pred5,y_pred6]:
    t_df["pred"] = x  
#     print(t_df.sample(5))
    sub_df = pd.DataFrame(data=x,columns=["pred"])
    
    sub_df["opt_pred"] = t_df.apply(allocate_energy, axis = 1)
#     print(sub_df.sample(10))
#     break
    sub_df.to_csv(path_or_buf="submission"+str(y),index= True, index_label= "index")
    y+=1

In [91]:
sub_df.sample(10)

Unnamed: 0,pred,opt_pred
23,0.040605,17.902321
841,0.175857,100.0
581,0.057349,100.0
774,0.183781,100.0
730,0.239369,100.0
247,0.05046,100.0
560,0.114806,100.0
557,0.087353,100.0
543,0.070931,100.0
197,0.04769,100.0


In [76]:
df = pd.DataFrame([{'c1':10, 'c2':100}, {'c1':11,'c2':110}, {'c1':12,'c2':120}])
df

Unnamed: 0,c1,c2
0,10,100
1,11,110
2,12,120


In [77]:
for index, row in df.iterrows():
    row["c3"] = row['c1']+row['c2']
    print(df)
    print(row['c1'],row['c2'], row["c3"])

   c1   c2
0  10  100
1  11  110
2  12  120
10 100 110
   c1   c2
0  10  100
1  11  110
2  12  120
11 110 121
   c1   c2
0  10  100
1  11  110
2  12  120
12 120 132


In [107]:
-(np.log(0.5+0.01)+3)

-2.3266554467362344

In [103]:
Index

3.6733445532637656