In [2]:
import pandas as pd, numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge

# Clean

In [4]:
train = pd.read_csv('https://raw.githubusercontent.com/ryskgit/QTW_CaseStudy/main/Case%20Study%201/Raw%20Data%201/train.csv')
material = pd.read_csv('https://raw.githubusercontent.com/ryskgit/QTW_CaseStudy/main/Case%20Study%201/Raw%20Data%201/unique_m.csv')
df = train.merge(material, on='critical_temp', how='left')

In [5]:
df.drop(columns='material', inplace=True)

In [6]:
single_vals = df.columns[df.eq(df.iloc[0]).all()].tolist()
print (single_vals)

['He', 'Ne', 'Ar', 'Kr', 'Xe', 'Pm', 'Po', 'At', 'Rn']


In [7]:
df.drop(columns=single_vals, inplace=True)

In [8]:
df.shape

(715607, 159)

In [9]:
df.isna().sum().sum()

0

In [10]:
df.duplicated().sum()

1622

In [12]:
df.drop_duplicates(inplace=True) # dropped but we can discuss what to do them with later

In [13]:
df.shape

(713985, 159)

In [14]:
df.head()

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,W,Re,Os,Ir,Pt,Au,Hg,Tl,Pb,Bi
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 713985 entries, 0 to 715606
Columns: 159 entries, number_of_elements to Bi
dtypes: float64(156), int64(3)
memory usage: 871.6 MB


# Preprocess

In [16]:
X = df.drop(columns="critical_temp")
y = df["critical_temp"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=77)

## No scaling

In [17]:
lr = LinearRegression()

In [18]:
lr.fit(X_train, y_train)

LinearRegression()

In [20]:
cross_val_score(lr, X_train, y_train, cv=10, scoring='r2').mean() # r2 of internal 10 fold cv

0.7385141069735254

In [21]:
np.abs(cross_val_score(lr, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error').mean()) # rmse of internal 10 fold cv

16.465695052030608

## Standard Scaler

In [22]:
pipe = Pipeline([("scaler", StandardScaler()), ("lr", LinearRegression())])

In [23]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('lr', LinearRegression())])

In [24]:
cross_val_score(pipe, X_train, y_train, cv=10, scoring='r2').mean() # r2 of internal 10 fold cv

0.7385141069735293

In [25]:
np.abs(cross_val_score(pipe, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error').mean()) # rmse of internal 10 fold cv

16.465695052030487

## Min max scaler

In [30]:
pipe2 = Pipeline([("min_max_scaler", MinMaxScaler()), ("lr", lr)])

In [31]:
pipe2.fit(X_train, y_train)

Pipeline(steps=[('min_max_scaler', MinMaxScaler()), ('lr', LinearRegression())])

In [32]:
cross_val_score(pipe2, X_train, y_train, cv=10, scoring='r2').mean() # r2 of internal 10 fold cv

0.7385141069735293

In [29]:
np.abs(cross_val_score(pipe2, X_train, y_train, cv=10, scoring='neg_root_mean_squared_error').mean()) # rmse of internal 10 fold cv

16.46569505203049

# Modeling

## L1 / Lasso (will redo with gridsearch later today)

In [39]:
lasso = Lasso(alpha=0.1)

lasso_pipeline = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ('model', lasso)
])

In [40]:
_ = lasso_pipeline.fit(X_train, y_train)

In [41]:
lasso_pipeline.score(X_test, y_test)

0.7210531214770268

In [44]:
preds = lasso_pipeline.predict(X_test)
np.sqrt(mean_squared_error(y_test, preds)) # 

16.996925199689926

## L2 / Ridge