In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pystan as ps

from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List every file available under the Kaggle input directory, one full path per line.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)


# Any results you write to the current directory are saved as output.

In [None]:
# Load the competition training set and preview its first five rows.
train = pd.read_csv(
    '/kaggle/input/dont-overfit-ii/train.csv',
)
train.head(5)

In [None]:
# Number of columns that contain at least one missing value.
train.isna().any(axis=0).sum()
# Author's observation from the output: no null values in the dataframe.

In [None]:
# Number of columns stored with the generic `object` dtype.
(train.dtypes == 'object').sum()
# Author's observation from the output: all data are numerical.

In [None]:
# Distinct values taken by the label column.
train.loc[:, 'target'].unique()

In [None]:
# Print pandas' built-in summary of the training frame.
train.info()

# Splitting Data

In [None]:
# Label vector to be predicted.
y = train['target']
# Feature matrix: every column except the row identifier and the label.
# (`columns=` alone selects the axis, so no separate `axis` argument is needed.)
X = train.drop(columns=['id', 'target'])

In [None]:
# Hold out part of the training data for validation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
)

# Trying Linear Regression Model


In [None]:
# Baseline: ordinary least squares fitted directly on the target column.
# Its continuous predictions are used as ranking scores for ROC AUC.
model_linreg = LinearRegression().fit(X_train, y_train)

# Scores for the held-out validation rows.
model_linreg_ypredict = model_linreg.predict(X_valid)

# Hold-out ROC AUC, plus a 20-fold cross-validated ROC AUC on the full
# training data.
model_linreg_rocaucscore = roc_auc_score(y_valid, model_linreg_ypredict)
model_linreg_cvscores = cross_val_score(model_linreg, X, y, cv=20, scoring='roc_auc')

# Report the hold-out score and the CV mean with a +/- 2 standard deviation band.
cv_mean = model_linreg_cvscores.mean()
cv_spread = 2 * model_linreg_cvscores.std()
print('linear regression\n  roc auc score: %0.4f, cross validation score: %0.4f (+/- %0.4f)'
      % (model_linreg_rocaucscore, cv_mean, cv_spread))

### Linear regression scores 0.627 on the leaderboard (LB), so the model appears to be overfitting.

## After seeing several kernels use a PyStan model, we'll try one too.

In [None]:
# NOTE: this cell modifies `train` in place (both columns are removed from the
# frame), so it raises KeyError if run a second time without reloading `train`.

# The row identifier is not a feature: remove it from the training frame.
del train['id']
# Pull the label out of the frame as integers; `train` now holds features only.
target = train.pop('target').astype(int)

# Load the test set and set its ids aside for the submission file;
# what remains in `test` is the feature matrix.
test = pd.read_csv('/kaggle/input/dont-overfit-ii/test.csv')
ids = test.pop('id')

In [None]:
# Stan program: Bayesian logistic regression.
#   - data:        training design matrix X with binary response y, plus the
#                  test design matrix new_X.
#   - parameters:  intercept alpha and coefficient vector beta.
#   - priors:      Cauchy(0, 10) on the intercept; Student-t with 1 degree of
#                  freedom and scale 0.03 on every coefficient.
#   - output:      y_pred, the linear predictor alpha + new_X * beta for the
#                  test rows. It is on the log-odds scale (the likelihood is
#                  bernoulli_logit), not a probability.
# Changes from the earlier version of this string: sizes and labels carry
# constraints so Stan rejects malformed input up front, the coefficient prior
# uses the vectorized sampling statement instead of an element-wise loop, and
# the padding whitespace at the end of each line is gone.
code = """
data {
  int<lower=0> N;                  // number of training observations
  int<lower=0> N2;                 // number of test observations
  int<lower=0> K;                  // number of features
  int<lower=0, upper=1> y[N];      // binary response
  matrix[N, K] X;                  // training design matrix
  matrix[N2, K] new_X;             // test design matrix
}
parameters {
  real alpha;                      // intercept
  vector[K] beta;                  // regression coefficients
}
transformed parameters {
  vector[N] linpred;               // linear predictor (log-odds) for the training rows
  linpred = alpha + X * beta;
}
model {
  alpha ~ cauchy(0, 10);           // prior for the intercept following Gelman 2008
  beta ~ student_t(1, 0, 0.03);    // vectorized: same prior on every coefficient
  y ~ bernoulli_logit(linpred);
}
generated quantities {
  vector[N2] y_pred;               // log-odds (not probabilities) for the test rows
  y_pred = alpha + new_X * beta;
}
"""

In [None]:
# Cheap shape checks before the slow Stan compile: the program declares
# y[N], X as N x K and new_X as N2 x K, so mismatched inputs would be
# rejected by Stan anyway -- fail early with a clearer message.
assert train.shape[1] == test.shape[1], 'train and test must have the same number of feature columns'
assert len(target) == train.shape[0], 'target must have one label per training row'

# Sizes are read from the frames instead of being hard-coded, so the cell
# keeps working if the data (or a feature subset) changes.
data = {
    'N': train.shape[0],   # number of training observations
    'N2': test.shape[0],   # number of test observations
    'K': train.shape[1],   # number of features
    'y': target,
    'X': train,
    'new_X': test,
}

# Compile the Stan program, then draw posterior samples (fixed seed for
# reproducibility).
sm = ps.StanModel(model_code=code)
fit = sm.sampling(data=data, seed=1234)
ex = fit.extract(permuted=True)

# Average y_pred over the posterior draws (axis 0) to get one score per test
# row. y_pred is the linear predictor alpha + new_X * beta, i.e. a log-odds
# score rather than a probability.
# The result gets its own name so that `target` keeps holding the training
# labels and this cell can be re-run.
test_pred = np.mean(ex['y_pred'], axis=0)

# Write the submission file with the id and score columns.
df = pd.DataFrame({'id': ids, 'target': test_pred})
df[['id', 'target']].to_csv('submission.csv', index=False)