In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
#from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

%matplotlib inline

In [2]:
dfCutoffData = pd.read_csv("2018_measures_around_cutoff.csv")

In [3]:
dfCutoffData.dtypes

profile                 float64
number                  float64
depth                   float64
temperature             float64
diff_temp               float64
vgrad                   float64
min_10_diff_temp        float64
max_10_diff_temp        float64
var_10_diff_temp        float64
std_10_diff_temp        float64
avg_10_diff_temp        float64
norm_10_diff_temp       float64
min_20_diff_temp        float64
max_20_diff_temp        float64
var_20_diff_temp        float64
std_20_diff_temp        float64
avg_20_diff_temp        float64
norm_20_diff_temp       float64
min_10_vgrad            float64
max_10_vgrad            float64
var_10_vgrad            float64
std_10_vgrad            float64
avg_10_vgrad            float64
norm_10_vgrad           float64
min_20_vgrad            float64
max_20_vgrad            float64
var_20_vgrad            float64
std_20_vgrad            float64
avg_20_vgrad            float64
norm_20_vgrad           float64
last_point              float64
away_fro

In [21]:
def lin_regr(x, y, attributeTarget, attributePredictor):
    x_train, x_test, y_train, y_test = (
        train_test_split(x, y, test_size=0.3, random_state=4))
    
    regr = linear_model.LinearRegression()
    if ((x_train.ndim < 2) or (x_test.ndim < 2)):
        # single column
        regr.fit(x_train.values.reshape(-1,1), y_train.values)
        y_pred_regr = regr.predict(x_test.values.reshape(-1,1))
    else:
        #multiple columns process
        regr.fit(x_train.values, y_train.values)
        y_pred_regr = regr.predict(x_test.values)
        
    try:
        coefficient = ("{:.2f}".format(regr.coef_))
    except Exception as e:
        coefficient = "NULL"

    try:
        absoluteError = ("{:.2f}".format(
            mean_absolute_error(y_test.values, y_pred_regr)))
    except Exception as e:
        absoluteError = "NULL"

    try:
        squaredError = ("{:.2f}".format(
            mean_squared_error(y_test.values, y_pred_regr)))
    except Exception as e:
        squaredError = "NULL"

    try:
        correlation = ("{:.2f}".format(
            np.sqrt(r2_score(y_test.values, y_pred_regr))))
    except Exception as e:
        correlation = "NULL"

    try:
        variance = ("{:.2f}".format(r2_score(y_test.values, y_pred_regr)))
    except Exception as e:
        variance = "NULL"

    print("\"{0}\",\"{1}\",{2},{3},{4},{5},{6}"
          .format(attributeTarget, attributePredictor, coefficient, 
                  absoluteError, squaredError, correlation, variance))

In [5]:
target = dfCutoffData['last_point']

In [6]:
def pnt_position(row):
    if row['away_from_last_point'] > 0:
        return 1
    elif row['away_from_last_point'] < 0:
        return 2
    else:
        return 0

In [7]:
dfCutoffData['pnt_position'] = dfCutoffData.apply(pnt_position, axis=1)

In [12]:
pd.set_option('display.max_columns', None) #display all column results
dfCutoffData.groupby('pnt_position').mean()

Unnamed: 0_level_0,profile,number,depth,temperature,diff_temp,vgrad,min_10_diff_temp,max_10_diff_temp,var_10_diff_temp,std_10_diff_temp,avg_10_diff_temp,norm_10_diff_temp,min_20_diff_temp,max_20_diff_temp,var_20_diff_temp,std_20_diff_temp,avg_20_diff_temp,norm_20_diff_temp,min_10_vgrad,max_10_vgrad,var_10_vgrad,std_10_vgrad,avg_10_vgrad,norm_10_vgrad,min_20_vgrad,max_20_vgrad,var_20_vgrad,std_20_vgrad,avg_20_vgrad,norm_20_vgrad,last_point,away_from_last_point
pnt_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,36151.11668,1391.611529,886.405908,6.321824,-0.649206,1.074725,-0.746442,0.109602,0.298545,0.266853,-0.0668,2.717452,-0.747054,0.114136,0.149829,0.190255,-0.031374,3.984693,-0.180686,1.234804,0.815878,0.441182,0.110636,2.717563,-0.187999,1.235781,0.409413,0.314477,0.051994,3.985274,1.0,0.0
1,36151.11668,1396.611529,889.437002,17.629276,-1.983387,3.284568,-5.848315,0.326515,6.606893,2.068952,-1.332123,1.012859,-5.848689,0.365622,3.833182,1.608324,-0.664495,1.304625,-0.537384,9.680846,18.087625,3.423938,2.205791,1.012869,-0.602116,9.681439,10.496493,2.661738,1.100357,1.304879,0.0,5.0
2,36151.11668,1381.111529,880.035848,6.309933,0.001306,-0.00208,-0.009377,0.016366,0.00019,0.008571,0.003739,0.855716,-0.012572,0.02041,0.000207,0.009054,0.004065,0.883712,-0.026968,0.015575,0.000498,0.014053,-0.00612,inf,-0.033467,0.020846,0.000533,0.014758,-0.006585,0.889811,0.0,-10.5


In [13]:
dfCutoffData.groupby('pnt_position').std()

Unnamed: 0_level_0,profile,number,depth,temperature,diff_temp,vgrad,min_10_diff_temp,max_10_diff_temp,var_10_diff_temp,std_10_diff_temp,avg_10_diff_temp,norm_10_diff_temp,min_20_diff_temp,max_20_diff_temp,var_20_diff_temp,std_20_diff_temp,avg_20_diff_temp,norm_20_diff_temp,min_10_vgrad,max_10_vgrad,var_10_vgrad,std_10_vgrad,avg_10_vgrad,norm_10_vgrad,min_20_vgrad,max_20_vgrad,var_20_vgrad,std_20_vgrad,avg_20_vgrad,norm_20_vgrad,last_point,away_from_last_point
pnt_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
0,7731.557602,233.051804,146.828963,2.818421,1.602488,2.648539,1.22945,0.953621,2.692541,0.476859,0.162401,0.394651,1.229111,0.953897,1.346005,0.337151,0.08173,0.697982,1.572869,2.034775,7.374137,0.788267,0.268412,0.393435,1.573133,2.034232,3.686382,0.5573,0.135007,0.696639,0.0,0.0
1,7730.876842,233.045588,146.778489,8.83091,3.079328,5.096661,4.28825,2.615009,13.733743,1.525252,0.944991,0.921836,4.287748,2.607581,6.872912,1.116471,0.473299,1.50338,4.311222,7.099498,37.56906,2.522779,1.564267,0.921836,4.298864,7.098702,18.804134,1.847077,0.7834,1.503423,0.0,2.582017
2,7730.830047,233.101206,146.953217,2.851729,0.024143,0.038554,0.020811,0.033604,0.008205,0.013233,0.013202,0.680706,0.022981,0.042647,0.006415,0.013716,0.012285,0.823377,0.050847,0.034084,0.018448,0.02036,0.020288,,0.064361,0.037459,0.014421,0.020924,0.018742,0.817761,0.0,5.76631


In [14]:
pd.reset_option('max_columns')

In [15]:
test1 = dfCutoffData[['std_20_diff_temp','std_20_vgrad']]

In [22]:
lin_regr(test1, target, target.name, test1.columns)

"last_point","Index(['std_20_diff_temp', 'std_20_vgrad'], dtype='object')",NULL,0.06,0.03,0.06,0.00


In [23]:
test2 = dfCutoffData['std_20_vgrad']
lin_regr(test2, target, target.name, test2.name)

"last_point","std_20_vgrad",NULL,0.06,0.03,0.06,0.00


In [24]:
testAll = ['diff_temp', 'vgrad', 'min_10_diff_temp', 'max_10_diff_temp',
           'var_10_diff_temp','std_10_diff_temp','avg_10_diff_temp',
           'norm_10_diff_temp','min_20_diff_temp','max_20_diff_temp',
           'var_20_diff_temp','std_20_diff_temp','avg_20_diff_temp',
           'norm_20_diff_temp','min_10_vgrad','max_10_vgrad',
           'var_10_vgrad','std_10_vgrad','avg_10_vgrad','norm_10_vgrad',
           'min_20_vgrad','max_20_vgrad','var_20_vgrad','std_20_vgrad',
           'avg_20_vgrad','norm_20_vgrad']

In [27]:
for col in testAll:
    dfMinCols = dfCutoffData[['last_point',col]].replace([np.inf, -np.inf], np.nan).dropna()
    testC = dfMinCols[col]
    targC = dfMinCols['last_point']
    lin_regr(testC, targC, targC.name, col)



"last_point","diff_temp",NULL,0.06,0.03,nan,-0.00
"last_point","vgrad",NULL,0.06,0.03,nan,-0.00
"last_point","min_10_diff_temp",NULL,0.06,0.03,0.06,0.00
"last_point","max_10_diff_temp",NULL,0.06,0.03,nan,-0.00
"last_point","var_10_diff_temp",NULL,0.06,0.03,0.04,0.00
"last_point","std_10_diff_temp",NULL,0.06,0.03,0.05,0.00
"last_point","avg_10_diff_temp",NULL,0.06,0.03,0.08,0.01
"last_point","norm_10_diff_temp",NULL,0.08,0.03,0.40,0.16
"last_point","min_20_diff_temp",NULL,0.06,0.03,0.06,0.00




"last_point","max_20_diff_temp",NULL,0.06,0.03,nan,-0.00
"last_point","var_20_diff_temp",NULL,0.06,0.03,0.04,0.00
"last_point","std_20_diff_temp",NULL,0.06,0.03,0.06,0.00
"last_point","avg_20_diff_temp",NULL,0.06,0.03,0.08,0.01
"last_point","norm_20_diff_temp",NULL,0.07,0.03,0.43,0.19
"last_point","min_10_vgrad",NULL,0.06,0.03,nan,-0.00
"last_point","max_10_vgrad",NULL,0.06,0.03,0.06,0.00
"last_point","var_10_vgrad",NULL,0.06,0.03,0.04,0.00




"last_point","std_10_vgrad",NULL,0.06,0.03,0.05,0.00
"last_point","avg_10_vgrad",NULL,0.06,0.03,0.08,0.01
"last_point","norm_10_vgrad",NULL,0.08,0.03,0.40,0.16




"last_point","min_20_vgrad",NULL,0.06,0.03,nan,-0.00
"last_point","max_20_vgrad",NULL,0.06,0.03,0.06,0.00
"last_point","var_20_vgrad",NULL,0.06,0.03,0.04,0.00
"last_point","std_20_vgrad",NULL,0.06,0.03,0.06,0.00
"last_point","avg_20_vgrad",NULL,0.06,0.03,0.08,0.01
"last_point","norm_20_vgrad",NULL,0.07,0.03,0.43,0.19
