In [1]:
# importing libraries

import pandas as pd
import numpy as np
from sklearn import model_selection as msl
from sklearn.linear_model import LinearRegression

## Prepare Data

In [2]:
# importing data

train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# view some sample
train_data.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987


In [3]:
# check for null entries in train_data

train_data.isnull().sum()

id           0
breath_id    0
R            0
C            0
time_step    0
u_in         0
u_out        0
pressure     0
dtype: int64

In [4]:
# samples from test_data
test_data.head()

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.0,0.0,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.23061,0
4,5,0,5,20,0.127644,26.320956,0


In [5]:
#check for null entries

test_data.isnull().sum()

id           0
breath_id    0
R            0
C            0
time_step    0
u_in         0
u_out        0
dtype: int64

## Feature Engineering

In [6]:
# this dont needed to be included as this we seen from some discussions 

train_data['u_in_cumsum'] = (train_data['u_in']).groupby(train_data['breath_id']).cumsum()

test_data['u_in_cumsum'] = (test_data['u_in']).groupby(test_data['breath_id']).cumsum()

## Clean Data

In [7]:
# seperate input training data and output training data

features = ['R','C','u_in_cumsum','u_out']

x_train = train_data[features]
x_test = test_data[features]

output_pressure = train_data[['pressure']]

x_train.head()

Unnamed: 0,R,C,u_in_cumsum,u_out
0,20,50,0.083334,0
1,20,50,18.466375,0
2,20,50,40.975653,0
3,20,50,63.784476,0
4,20,50,89.140326,0


In [8]:
# lets see what the output really looks
output_pressure.head()

Unnamed: 0,pressure
0,5.837492
1,5.907794
2,7.876254
3,11.742872
4,12.234987


## Split Data for Validation and Training

In [86]:
# Now we should split training data into ratio of 80 : 20 
# 80 for training and then the rest 20 for validatiion of model outputs

#x_train_data, x_validatn_data, y_train_val, y_validatn_val = msl.train_test_split(x_train,output_pressure,train_size = 0.05,test_size=0.01)
x_train_data, x_validatn_data, y_train_val, y_validatn_val = msl.train_test_split(x_train,output_pressure)

x_train_data.describe()

Unnamed: 0,R,C,u_in_cumsum,u_out
count,4527000.0,4527000.0,4527000.0,4527000.0
mean,27.03205,26.0805,406.1654,0.6204672
std,19.59644,17.15202,414.0726,0.4852708
min,5.0,10.0,0.0,0.0
25%,5.0,10.0,134.812,0.0
50%,20.0,20.0,275.4686,1.0
75%,50.0,50.0,514.7667,1.0
max,50.0,50.0,2718.981,1.0


## Prepare and Define Models 

In [61]:
# importing model libraries

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.naive_bayes import GaussianNB as GNB

### Linear Regression Method

In [62]:
# lets apply different algorithms to compare their results 
# Like we can try Linear Regression, RandomForests and Neural Networks Approch

# first we try linear regression
algo1 = LinearRegression()
algo1.fit(x_train_data,y_train_val)



LinearRegression()

In [63]:
# lets see the score of this algorithm

score1 = algo1.score(x_validatn_data,y_validatn_val)
score1

0.4664997752031804

## Neural networks Method

In [64]:
# the best method is usually the Neural Networks 

#lets import tensorflow libraries
import tensorflow as tf
from tensorflow.keras import layers

In [70]:
# define model

neural_model = tf.keras.Sequential()

neural_model.add(layers.Dense(128, activation= 'relu' , name = 'layer1' ) )
neural_model.add(layers.Dense(32, activation = 'relu', name = 'layer2' ) )
neural_model.add(layers.Dense(32, activation = 'relu', name = 'layer3' ) )
neural_model.add(layers.Dense(8, activation = 'relu', name = 'layer4' ) )
neural_model.add(layers.Dense(1, activation = 'relu', name = 'final_layer' ) )

neural_model.compile(loss='mse', optimizer='adam',metrics = ['accuracy'])


### Epochs and Batch Size 

In [75]:
epochs = 4
batch_size = 128

In [76]:
# fitting our model on data 
neural_model.fit( x = x_train_data.values ,y = y_train_val.values , epochs = epochs, batch_size = batch_size )

Train on 301800 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7face7e19450>

In [77]:
# lets see the score of second algorithm

y_validatn_preds = neural_model.predict(x_validatn_data.values)

## Checking Score of Our Model

In [78]:
# importing sklearn libraries 

from sklearn.metrics import accuracy_score


y_true = np.array(y_validatn_val).astype('int64')
y_preds = np.array(y_validatn_preds).astype('int64')

accuracy_score(y_true,y_preds)

0.26083499005964217

## Predicting & Exporting values 

In [89]:
# whichever algo has more score predict according to that

preds = algo3.predict(x_test)

preds = preds.reshape(-1)
preds

array([6.11206043, 6.00888391, 6.93534262, ..., 7.00295903, 7.01632689,
       7.07816781])

In [90]:
ans = {
    'id':test_data['id'],
    'pressure':preds
}
ans = pd.DataFrame(ans)
ans.to_csv('pred.csv',index = False)

## Light GBM Method 

In [80]:
# build the lightgbm model

import lightgbm as lgb

In [87]:
algo3 = lgb.LGBMRegressor()
algo3.fit(x_train_data,y_train_val)

LGBMRegressor()

In [88]:
# lets see the score of this algorithm

score3 = algo3.score(x_validatn_data,y_validatn_val)
score3

0.7751882121930562