In [1]:
import pandas as pd
import numpy as np
from math import sqrt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

def multiclass(var):
    """Bin values into five ordinal classes by order of magnitude.

    Class mapping (upper bounds are inclusive)::

        value <= 0.1            -> 5
        0.1  < value <= 1       -> 4
        1    < value <= 10      -> 3
        10   < value <= 100     -> 2
        value > 100 (or NaN)    -> 1

    The input is never modified. An element-wise loop that assigns the
    labels back into ``var`` would, when handed ``df[col].values``, overwrite
    the DataFrame column (and every object sharing its buffer) with class
    labels.

    Parameters
    ----------
    var : array-like of numbers
        One-dimensional NumPy array, list or pandas Series.

    Returns
    -------
    numpy.ndarray or pandas.Series
        Labels in 1..5, downcast to the smallest integer dtype that fits.
        A Series (same index and name) is returned for Series input,
        an ndarray otherwise.
    """
    values = np.asarray(var, dtype=float)

    # np.select takes the first condition that holds, which reproduces an
    # if/elif chain; NaN fails every comparison and falls through to the
    # default, i.e. class 1.
    conditions = [
        values <= 10**-1,
        values <= 10**0,
        values <= 10**1,
        values <= 10**2,
    ]
    labels = np.select(conditions, [5, 4, 3, 2], default=1)

    if isinstance(var, pd.Series):
        labels = pd.Series(labels, index=var.index, name=var.name)

    return pd.to_numeric(labels, downcast = 'integer')

# Load the pre-processed dataset and discard the 'Unnamed: 0' column
# (presumably an index column written out by an earlier to_csv -- confirm).
db = pd.read_csv('data/lc_db_processed.csv').drop(columns = ['Unnamed: 0'])

# Feature matrix: every column except the target.
X = db.drop(columns = 'conc1_mean')

# Binary target: 1 where conc1_mean > 1, otherwise 0.
y = np.where(db['conc1_mean']>1,1,0)

# Regression target: the raw conc1_mean values. Copied so that it does not
# share memory with the column stored in `db`.
y_reg = db['conc1_mean'].copy()

# Multiclass target (labels 1..5). multiclass() receives its own private copy
# so that any in-place write inside it cannot reach `db` or `y_reg`.
y_mul = multiclass(db['conc1_mean'].values.copy())

# Binary
## First dumb model: always predict the most frequent class

In [2]:
# Class balance of the binary target `y`: number of samples per label,
# most frequent label first.
pd.Series(y).value_counts()

1    16673
0    10163
dtype: int64

In [3]:
# Stratified hold-out split on the binary target.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 42, stratify = y)

# Majority-class baseline: the most frequent label is learned from the
# training split instead of being hard-coded, so the cell stays correct if
# the class balance of the data changes.
train_labels, train_counts = np.unique(y_train, return_counts = True)
majority_class = train_labels[train_counts.argmax()]
y_pred_freq = np.full(len(X_test), majority_class, dtype = float)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(accuracy_score(y_test, y_pred_freq))
print(sqrt(mean_squared_error(y_test, y_pred_freq)))

0.6212737127371274
0.6154074156710111


## Second dumb model: predict a class uniformly at random

In [4]:
# Stratified hold-out split on the binary target.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.33, random_state = 42, stratify = y)

# Random baseline: draw 0 or 1 uniformly for every test sample. A seeded
# generator is used so the printed scores are identical on every run.
y_pred_naive = np.random.default_rng(42).choice(2, size = len(y_test))

# scikit-learn metrics take (y_true, y_pred), in that order.
print(accuracy_score(y_test, y_pred_naive))
print(sqrt(mean_squared_error(y_test, y_pred_naive)))

0.4897244805781391
0.7143357189878305


# Multiclass
## First dumb classifier: always predict the most frequent class

In [7]:
# Class balance of the multiclass target `y_mul`: number of samples per
# label, most frequent label first.
pd.Series(y_mul).value_counts()

3    6831
2    6196
4    5152
5    5011
1    3646
dtype: int64

In [10]:
# Stratified hold-out split on the multiclass target.
X_train, X_test, y_train, y_test = train_test_split(X,y_mul, test_size = 0.33, random_state = 42, stratify = y_mul)

# Majority-class baseline: the most frequent label is learned from the
# training split instead of being hard-coded.
train_labels, train_counts = np.unique(y_train, return_counts = True)
majority_class = train_labels[train_counts.argmax()]
y_pred_freq = np.full(len(X_test), majority_class, dtype = float)

# scikit-learn metrics take (y_true, y_pred), in that order.
print(accuracy_score(y_test, y_pred_freq))
print(sqrt(mean_squared_error(y_test, y_pred_freq)))

0.254516711833785
1.3089315170590465


## Second dumb classifier: predict a class uniformly at random

In [18]:
# Stratified hold-out split on the multiclass target.
X_train, X_test, y_train, y_test = train_test_split(X,y_mul, test_size = 0.33, random_state = 42, stratify = y_mul)

# Random baseline: draw one of the labels 1..5 uniformly for every test
# sample. A seeded generator is used so the printed scores are identical on
# every run.
y_pred_naive = np.random.default_rng(42).choice(np.arange(1,6), size = len(X_test))

# scikit-learn metrics take (y_true, y_pred), in that order.
print(accuracy_score(y_test, y_pred_naive))
print(sqrt(mean_squared_error(y_test, y_pred_naive)))

0.21375338753387535
1.9016288838582882


# Regression

In [5]:
# Keep only the non-object columns of the feature matrix; the linear model
# fitted below is given X as-is.
object_columns = X.select_dtypes(include = 'object').columns
X = X.drop(columns = object_columns)

In [6]:
# Min-max scale the regression target to [0, 1].
#
# The scaler is fitted on `y_reg` (the continuous target) and the result is
# stored under its own name. Fitting it on `y`, the 0/1 class labels, is an
# identity transform whose only effect is to replace `y` with a 2-D float
# column, so cells that expect 1-D integer labels stop behaving the same
# when they are re-run.
scaler = MinMaxScaler(feature_range=(0,1))
y_reg_column = np.asarray(y_reg, dtype = float).reshape(-1,1)
scaler.fit(y_reg_column)
y_reg_scaled = scaler.transform(y_reg_column).ravel()
# `scaler.inverse_transform` maps values predicted on the scaled target back
# to the original units.

In [8]:
from sklearn.linear_model import RidgeCV

# Plain random hold-out split on the continuous target (no stratification).
X_train, X_test, y_train, y_test = train_test_split(X,y_reg, test_size = 0.33, random_state = 42)

# Ridge regression with its built-in cross-validated choice of the
# regularisation strength, using the estimator's default settings.
clf = RidgeCV()
clf.fit(X_train,y_train)
y_pred_reg = clf.predict(X_test)

# scikit-learn metrics take (y_true, y_pred), in that order.
# NOTE: this prints the mean squared error itself, not its square root.
print(mean_squared_error(y_test, y_pred_reg))

7221098.15852749
