# Raul

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score
from rdkit import Chem



In [2]:
"""
Read in train and test as Pandas DataFrames
"""
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [None]:
df_train.head()

In [None]:
df_test.head()

In [3]:
#store gap values
Y_train = df_train.gap.values
#row where testing examples start
test_idx = df_train.shape[0]
#delete 'Id' column
df_test = df_test.drop(['Id'], axis=1)
#delete 'gap' column
df_train = df_train.drop(['gap'], axis=1)

In [4]:
#DataFrame with all train and test examples so we can more easily apply feature engineering on
df_all = pd.concat((df_train, df_test), axis=0)
df_all.head()

Unnamed: 0,smiles,feat_001,feat_002,feat_003,feat_004,feat_005,feat_006,feat_007,feat_008,feat_009,...,feat_247,feat_248,feat_249,feat_250,feat_251,feat_252,feat_253,feat_254,feat_255,feat_256
0,c1ccc(o1)-c1ccc(s1)-c1cnc(-c2scc3[se]ccc23)c2n...,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,C1=CC=C(C1)c1cc2ncc3c4[SiH2]C=Cc4ncc3c2c2=C[Si...,1,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,0,0,0,0
2,[nH]1c-2c([SiH2]c3cc(-c4scc5C=CCc45)c4nsnc4c-2...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
3,[nH]1c2-c3occc3Cc2c2c1cc(-c1cccc3=C[SiH2]C=c13...,1,0,0,0,1,1,1,0,0,...,0,1,0,0,0,1,0,0,0,0
4,c1cnc2c3oc4cc(-c5ncncn5)c5nsnc5c4c3c3cocc3c2c1,0,0,0,0,1,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0


# Feature Engineering! 
Here we use RDKit to find valuable features for each smiles string 


In [5]:
from rdkit.Chem import rdMolDescriptors

def apply_rdkit(s, func):
    m = Chem.MolFromSmiles(s)
    return func(m)

In [None]:
num_rings = np.vstack(df_all.smiles.astype(str).apply(lambda s: apply_rdkit(s, rdMolDescriptors.CalcNumRings)))
df_all['num_rings'] = pd.DataFrame(num_rings)

In [None]:
num_rot_bonds = np.vstack(df_all.smiles.astype(str).apply(lambda s: apply_rdkit(s, 
                                                                    rdMolDescriptors.CalcNumRotatableBonds)))
df_all['num_rot_bonds'] = pd.DataFrame(num_rot_bonds)

In [43]:
#Drop the 'smiles' column
# df_all = df_all.drop(['smiles'], axis=1)
df_all = df_all.loc[:, (df_all != 0).any(axis=0)]
vals = df_all.values
X_train = vals[:test_idx]
X_test = vals[test_idx:]
print "Train features:", X_train.shape
print "Train gap:", Y_train.shape
print "Test features:", X_test.shape

Train features: (1000000, 256)
Train gap: (1000000,)
Test features: (824230, 256)


In [None]:
print df_all.head()

In [46]:
RCV = RidgeCV(alphas=[0.1, 1.0, 10.0])
RCV.fit(X_train, Y_train)  
RCV_pred = RCV.predict(X_test)

RCV_scores = cross_val_score(RCV, X_train, Y_train)

In [47]:
RCV_scores.mean()
write_to_file("fourfeaturesRF.csv", RF_pred)

0.4609970136412766

In [24]:
LR = LinearRegression()
LR.fit(X_train, Y_train)
LR_pred = LR.predict(X_test)

LR_scores = cross_val_score(LR, X_train, Y_train)
LR_scores.mean() 

0.52633855773060512

In [21]:
RF = RandomForestRegressor()
RF.fit(X_train, Y_train)
RF_pred = RF.predict(X_test)

RF_scores = cross_val_score(RF, X_train, Y_train)
RF_scores.mean()

0.63827262767536153

In [48]:
def write_to_file(filename, predictions):
    with open(filename, "w") as f:
        f.write("Id,Prediction\n")
        for i,p in enumerate(predictions):
            f.write(str(i+1) + "," + str(p) + "\n")

In [25]:
write_to_file("fourfeaturesRF.csv", RF_pred)

In [26]:
df_rf = pd.read_csv("fourfeaturesRF.csv")


In [34]:
print RF_scores.mean()

0.638272627675


In [15]:
print df_rf.shape

(824230, 2)


# Using Gradient Boosting Regressors 
We now extend our formulation to an ensemble method that uses many estimators 

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


In [None]:
GB = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, 
                                 max_depth=1, random_state=0, loss='ls')
GB.fit(X_train, Y_train)
GB_pred = GB.predict(X_test)

GB_scores = cross_val_score(GB, X_train, Y_train)
GB_scores.mean()

In [None]:
write_to_file("gbprediction.csv", GB_pred)

In [None]:
df_gb = pd.read_csv("gbprediction.csv")

In [None]:
print df_gb.shape

# Support Vector Regression

In [None]:
from sklearn import svm
s = svm.SVR()
s.fit(X_train, Y_train)
s_pred = s.predict(X_test)

s_scores = cross_val_score(s, X_train, Y_train)
s_scores.mean()

# Andy 