# OSIC - Linear Regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from pathlib import Path

## Data loading and preprocessing

In [None]:
# Directory that holds the competition CSV files.
ROOT = Path("../input/osic-pulmonary-fibrosis-progression")

# Read the three tables in one pass: training visits, test baselines and the
# sample submission (which lists every Patient_Week id to predict).
train, test, sub = (
    pd.read_csv(ROOT / csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# keep=False discards *every* row of a repeated (Patient, Weeks) pair,
# not only the extra copies.
train = train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False)

train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# create training data
# For every patient, the first row supplies the baseline features and each
# later row supplies one (target_week, target_FVC) pair, giving one training
# sample per follow-up visit.
# NOTE(review): "first row" is the first row in file order for that patient;
# this is presumably the earliest visit - confirm train.csv is sorted by week.
trainData = []
for p in train['Patient'].unique():
    patientData = train[train['Patient'] == p]
    firstMeasure = list(patientData.iloc[0, :].values)

    # Read week and FVC from the SAME follow-up row. The previous version took
    # the week from row i+1 but the FVC from row i, so every target week was
    # paired with the FVC measured at the preceding visit.
    followUps = patientData.iloc[1:]
    for week, fvc in zip(followUps['Weeks'], followUps['FVC']):
        trainData.append(firstMeasure + [week, fvc])

trainData = pd.DataFrame(trainData)

# The first seven names relabel the original train columns (in file order);
# the last two are the values appended in the loop above.
trainData.columns = ['PatientID', 'first_week', 'first_FVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['target_week', 'target_FVC']
# first_Percent is not used as a model feature.
trainData.drop(columns=['first_Percent'], inplace=True)

trainData.head()

In [None]:
# create testing data
# Every sample-submission id is "<Patient>_<Week>"; splitting gives a string
# array whose column 0 is the patient id and column 1 the requested week.
subSplit = np.array(list(sub['Patient_Week'].apply(lambda x: x.split('_')).values))
testData = []
for p in np.unique(subSplit[:, 0]):
    patientData = test[test['Patient'] == p]
    # Baseline features come from the patient's first row in the test table.
    firstMeasure = list(patientData.iloc[0, :].values)
    for week in subSplit[subSplit[:, 0] == p, 1]:
        # subSplit holds strings; cast so target_week is an integer feature,
        # matching target_week in the training frame, instead of relying on
        # implicit string-to-float coercion when the model is applied.
        testData.append(firstMeasure + [int(week)])
testData = pd.DataFrame(testData)
testData.columns = ['PatientID', 'first_week', 'first_FVC', 'first_Percent', 'Age', 'Sex', 'SmokingStatus'] + ['target_week']

# first_Percent is not used as a model feature.
testData.drop(columns=['first_Percent'], inplace=True)

In [None]:
# Feature engineering: replace the categorical columns with integer codes.
# PatientID is kept in both frames; it is needed later to build submission ids.
le = LabelEncoder()

# The encoder is re-fitted on each training column and then applied to the
# matching test column, so both frames share the same code for a category.
for categoricalColumn in ('Sex', 'SmokingStatus'):
    trainData[categoricalColumn] = le.fit_transform(trainData[categoricalColumn])
    testData[categoricalColumn] = le.transform(testData[categoricalColumn])

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

# Model inputs are every column except the patient id and the label.
trainFeatures = trainData.drop(columns=['PatientID', 'target_FVC'])
trainTarget = trainData['target_FVC']
testFeatures = testData.drop(columns=['PatientID'])

# Ordinary least-squares fit, then one FVC prediction per row of testData.
model = LinearRegression()
model.fit(trainFeatures, trainTarget)
prediction = model.predict(testFeatures)

In [None]:
# Assemble the submission rows: id, predicted FVC and a confidence value.
# Confidence is drawn uniformly at random from [0, 300) for every row, so the
# resulting file differs between runs unless NumPy's global seed is fixed.
sub = [
    [patient + '_' + str(week), pred, np.random.uniform(0, 300)]
    for patient, week, pred in zip(testData['PatientID'], testData['target_week'], prediction)
]
sub = pd.DataFrame(sub)
sub.columns = ['Patient_Week', 'FVC', 'Confidence']

In [None]:
# Write the submission table to submission.csv without the index column.
sub.to_csv('submission.csv', index=False)

In [None]:
# Display the submission table.
sub

In [None]:
# Work on a copy so the columns added below do not end up in `sub`.
cheat_test = sub.copy()
cheat_test

In [None]:
# Split the "<Patient>_<Week>" id into separate Patient and Week columns.
# Both new columns hold strings at this point.
cheat_test[['Patient','Week']] = cheat_test['Patient_Week'].str.split('_',expand=True)

In [None]:
# Parse the Week strings into numbers with one vectorized call on the column.
cheat_test['Week'] = pd.to_numeric(cheat_test['Week'])

In [None]:
# Spot-check: show the predicted row for one specific patient at week 6.
cheat_test[(cheat_test.Patient == 'ID00419637202311204720264') & (cheat_test.Week.isin([6, ]))]

In [None]:
# Per-patient mean absolute error between predicted and measured FVC.
# NOTE(review): this relies on the test patients also having rows in `train`,
# which is where the measured values are looked up - confirm for your data.
avg = []

for patient in test['Patient'].unique():

    print(patient)

    measured = train.loc[train.Patient == patient, ['Weeks', 'FVC']]
    predicted = cheat_test.loc[cheat_test.Patient == patient, ['Week', 'FVC']]

    # Align prediction and measurement by week with an inner join instead of
    # subtracting two arrays positionally; the positional form fails (or
    # silently misaligns) whenever the two selections differ in length or order.
    aligned = measured.merge(
        predicted,
        left_on='Weeks',
        right_on='Week',
        suffixes=('_true', '_pred'),
    )

    if aligned.empty:
        # No measured week has a matching prediction; skip this patient rather
        # than divide by zero.
        continue

    # Take the absolute value per visit *before* averaging. abs(mean(error))
    # lets over- and under-predictions cancel and can report ~0 for a poor fit.
    avg.append((aligned['FVC_pred'] - aligned['FVC_true']).abs().mean())


In [None]:
# Convert the list of per-patient errors to an array for the statistics below.
avg = np.array(avg)

In [None]:
# Mean of the per-patient errors.
avg.mean()

In [None]:
# Spread (standard deviation) of the per-patient errors.
avg.std()

In [None]:
# Show the individual per-patient errors.
avg