# Splitting data

## Code for splitting data into train and test sets

In [1]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
"""Functions def"""
def read_data(raw_clinical_note):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    raw_clinical_note
        Path or file-like object, passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table; the first row supplies the column names and
        missing-value detection is enabled.
    """
    return pd.read_csv(raw_clinical_note, na_filter=True, header=0)

def scaler(x_train, x_test, logs_file):
    """Standardize features and persist the fitted scaler.

    A ``StandardScaler`` is fitted on ``x_train`` only and then applied to
    both ``x_train`` and ``x_test``, so the test set is transformed with the
    training-set statistics.

    Parameters
    ----------
    x_train
        Training features used to fit the scaler.
    x_test
        Test features, transformed with the scaler fitted on ``x_train``.
    logs_file
        String prefix for the output file. The fitted scaler is pickled to
        ``logs_file + 'scaler'`` (plain concatenation, so a directory prefix
        must already end with a path separator).

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)`` as returned by
        ``StandardScaler.transform``.
    """
    # Local name differs from the function name to avoid shadowing it.
    fitted_scaler = StandardScaler().fit(x_train)
    x_train_scaled = fitted_scaler.transform(x_train)
    x_test_scaled = fitted_scaler.transform(x_test)

    # The context manager guarantees the file is flushed and closed even if
    # pickling fails; the original left the handle open.
    with open(logs_file + 'scaler', 'wb') as scaler_file:
        pickle.dump(fitted_scaler, scaler_file)
    return x_train_scaled, x_test_scaled

In [3]:
"""Path"""
# NOTE(review): absolute, machine-specific directory. It must keep its
# trailing "/" because file names are appended by plain string concatenation.
path = "C:/Users/Salvador/Modelo_COVID19/Libretas manuscrito/BCM Infectius diseases/"
raw_data = path + "data_final_mor.csv"

# Load through the notebook's own read_data helper instead of duplicating the
# pd.read_csv call; the helper passes header=0 and na_filter=True, which match
# the read_csv defaults used here before.
data = read_data(raw_data)
print("Total records", data.shape[0])

Total records 11564


In [4]:
# Features: every column except 'Unnamed: 0' (presumably a leftover index
# column from an earlier CSV export -- TODO confirm) and the target.
X = data.drop(columns=['Unnamed: 0', 'Mortality'])

# Target as a 2-D column vector with one row per record.
y = data['Mortality'].values.reshape(len(data), 1)

# Column labels are kept so the scaled arrays can be turned back into frames.
feature_list = X.columns

print("X:", X.shape)
print("Y:", y.shape)

X: (11564, 34)
Y: (11564, 1)


In [5]:
"""Splitting data: train set and test set"""
# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# of the target the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=242,
    stratify=y,
)
print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_test:", X_test.shape, "y_test:", y_test.shape)

X_train: (9251, 34) y_train: (9251, 1)
X_test: (2313, 34) y_test: (2313, 1)


In [6]:
"""Scaling data"""
# Fit the scaler on the training features, apply it to both partitions, and
# pickle it under `path`.
# NOTE: this rebinds X_train / X_test to the scaled arrays, so re-running the
# cell would scale already-scaled data and overwrite the saved scaler.
X_train, X_test = scaler(x_train=X_train, x_test=X_test, logs_file=path)
print("X_train:", X_train.shape, "X_test:", X_test.shape)

X_train: (9251, 34) X_test: (2313, 34)


In [7]:
"""From numpy to dataframe"""
# Re-attach the original feature names to the scaled arrays.
X_train = pd.DataFrame(data=X_train, columns=feature_list)
X_test = pd.DataFrame(data=X_test, columns=feature_list)

# The target is stored under the column label "lethality".
# NOTE(review): the source column is named 'Mortality' -- confirm that
# downstream readers expect the "lethality" label.
y_train = pd.DataFrame(data=y_train, columns=["lethality"])
y_test = pd.DataFrame(data=y_test, columns=["lethality"])

"""Saving datasets"""
# Files are written next to the raw data. The default index is written too,
# so each CSV gains an unnamed leading index column.
X_train.to_csv(path_or_buf=path + "X_train.csv")
X_test.to_csv(path_or_buf=path + "X_test.csv")
y_train.to_csv(path_or_buf=path + "y_train.csv")
y_test.to_csv(path_or_buf=path + "y_test.csv")