In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.ensemble import RandomForestRegressor # A random forest regressor
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html    

from sklearn.decomposition import PCA # For Principal Component Analysis
import matplotlib.pyplot as plt # For creating visualizations


# Not using now: 
# from sklearn.svm import SVR # Epsilon-Support Vector Regression
# from sklearn.svm import LinearSVR # Linear Support Vector Regression
# from sklearn.preprocessing import StandardScaler # Standardize features by removing the mean and scaling to unit variance 
# from umap import UMAP # UMAP is a general purpose manifold learning and dimension reduction algorithm
# from sklearn.manifold import TSNE # t-SNE is a tool to visualize high-dimensional data
# from sklearn.pipeline import make_pipeline 
# from sklearn.model_selection import KFold # Provides train/test indices to split data in train/test sets


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))



# Small program for moving columns: movecol
# From: https://towardsdatascience.com/reordering-pandas-dataframe-columns-thumbs-down-on-standard-solutions
def movecol(df, cols_to_move=None, ref_col='', place='After'):
    """Return ``df`` with ``cols_to_move`` repositioned next to ``ref_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are to be reordered. It is not modified.
    cols_to_move : list-like of column labels, optional
        Columns to relocate, in the order they should appear.
        ``None`` (the default) moves nothing.
    ref_col : column label
        Anchor column; must be one of ``df``'s columns.
    place : {'After', 'Before'}
        'After' puts the moved columns directly after ``ref_col``;
        'Before' puts them directly before it.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the same columns in the new order.

    Raises
    ------
    ValueError
        If ``place`` is not 'After' or 'Before', or if ``ref_col`` is not a
        column of ``df``.
    """
    # Previously an unknown `place` fell through both branches and crashed
    # later with an UnboundLocalError; fail early with a clear message instead.
    if place not in ('After', 'Before'):
        raise ValueError("place must be 'After' or 'Before', got {!r}".format(place))

    moved = [] if cols_to_move is None else list(cols_to_move)
    cols = df.columns.tolist()
    if ref_col not in cols:
        raise ValueError("ref_col {!r} is not a column of df".format(ref_col))

    ref_pos = cols.index(ref_col)
    if place == 'After':
        head = cols[:ref_pos + 1]
    else:
        head = cols[:ref_pos]
        # The anchor itself has to follow the moved columns.
        moved = moved + [ref_col]

    # Columns being moved must not also stay in their old position.
    head = [col for col in head if col not in moved]
    placed = head + moved
    tail = [col for col in cols if col not in placed]

    return df[head + moved + tail]


def metric(y_true, y_pred):
    """Mean normalized absolute error between targets and predictions.

    Along axis 0 the absolute errors are summed and divided by the sum of
    the true values; the mean of the resulting ratio(s) is returned.
    Lower is better.
    """
    abs_error_totals = np.sum(np.abs(y_true - y_pred), axis=0)
    true_totals = np.sum(y_true, axis=0)
    return np.mean(abs_error_totals / true_totals)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# https://medium.com/datadriveninvestor/random-forest-regression-9871bc9a25eb

In [None]:
## Load data

# Functional connectivity (FNC) features and source-based morphometry (SBM)
# loadings, read into one DataFrame each.
fnc_df = pd.read_csv("../input/trends-assessment-prediction/fnc.csv")
loading_df = pd.read_csv("../input/trends-assessment-prediction/loading.csv")

# Feature names are every column after the first one
# (assumes the first column of each file is "Id" - TODO confirm).
fnc_features = fnc_df.columns[1:].tolist()
loading_features = loading_df.columns[1:].tolist()

# Training scores. Every row of this file is flagged as training data, so
# after the left merge below the flag is missing (NaN) for all other subjects.
labels_df = pd.read_csv("../input/trends-assessment-prediction/train_scores.csv")
labels_df["is_train"] = True

# Combine the two feature tables on Id, then attach the scores and the flag.
df = pd.merge(fnc_df, loading_df, on="Id")
df = pd.merge(df, labels_df, on="Id", how="left")

# Rows flagged True are training subjects; everything else is test data
# (their score columns are empty, only the FNC and SBM features are filled).
train_df = df[df["is_train"] == True].copy()
test_df = df[df["is_train"] != True].copy()

# Sanity check: both frames should have the same number of columns.
train_df.shape, test_df.shape



In [None]:
# Reduce the FNC features to 26 principal components so that FNC and SBM
# contribute the same number of features (SBM has 26 components).
# random_state is fixed because scikit-learn may pick the randomized SVD
# solver for a matrix of this size; without a seed the components (and every
# model trained on them) would change from run to run.
pca = PCA(n_components=26, random_state=42)

# The PCA is fitted on every row of df, i.e. train and test subjects together.
X_fnc = pca.fit_transform(df[fnc_features].values)

# Column names pca0, pca1, ... - one per component.
pca_fnc_features = ['pca' + str(number) for number in range(X_fnc.shape[1])]

# Reuse df's own index so the join below lines up row-for-row even if df's
# index is not the default 0..n-1.
df_pca_fnc_features = pd.DataFrame(X_fnc, columns=pca_fnc_features, index=df.index)

# Drop PCA columns left over from an earlier run of this cell first; joining
# twice would otherwise fail with a column-overlap error.
df = df.drop(columns=pca_fnc_features, errors="ignore").join(df_pca_fnc_features)




In [None]:
## Make training and test datasets

# Rows flagged is_train == True are training subjects; all remaining rows are
# test subjects (their score columns are empty).
is_train_mask = df["is_train"] == True

# The raw FNC columns are dropped from both frames; the PCA components that
# were joined onto df replace them.
train_df = df[is_train_mask].drop(columns=fnc_features)
test_df = df[~is_train_mask].drop(columns=fnc_features)

# Sanity check: both frames should have the same number of columns.
train_df.shape, test_df.shape

In [None]:
# Reorder columns so the targets sit directly after Id, ahead of the features.
leading_cols = ['age', 'is_train', 'domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']

test_df = movecol(test_df, cols_to_move=leading_cols, ref_col='Id', place='After')
train_df = movecol(train_df, cols_to_move=leading_cols, ref_col='Id', place='After')

# The train/test flag is no longer needed once the frames are separate.
test_df = test_df.drop(columns='is_train')
train_df = train_df.drop(columns='is_train')

test_df.shape, train_df.shape

In [None]:
# Handle missing values in the training scores.
# Alternative kept for reference: drop every row with a missing element.
# train_df = train_df.dropna()

# Replace NaN in the four domain score columns with that column's mean.
score_cols = ['domain1_var1', 'domain1_var2', 'domain2_var1', 'domain2_var2']
score_means = train_df[score_cols].mean()
train_df[score_cols] = train_df[score_cols].fillna(score_means)

# Check whether any NaN remains in the training data, and show both shapes.
train_df.isnull().values.any(), train_df.shape, test_df.shape

# Missing values can also be replaced by the mean, the median or the most
# frequent value using sklearn.impute.SimpleImputer:
# https://scikit-learn.org/stable/auto_examples/impute/plot_missing_values.html#sphx-glr-auto-examples-impute-plot-missing-values-py

In [None]:
## Feature Scaling  
## p.72 in Hands On Machine Learning

# One of the most important transformations you need to apply to your data is feature scaling. With few exceptions, Machine Learning algorithms don’t perform well when the input numerical attributes have very different scales.
# Note that scaling the target values is generally not required.
# There are two common ways to get all attributes to have the same scale: min-max scaling and standardization.
# Scikit-Learn provides a transformer called StandardScaler for standardization. first it subtracts the mean value (so standardized values always have a zero mean), and then it divides by the standard deviation so that the resulting distribution has unit variance.     
# Min-max scaling (many people call this normalization) is quite simple: values are shifted and rescaled so that they end up ranging from 0 to 1. Scikit-Learn provides a transformer called MinMaxScaler for this.

In [None]:
## TEMPORARY: inspect train_df to check that the for-loop works as planned.
# train_df['domain2_var1'] = train_df['domain2_var1']*100
# Show only the first rows; rendering the whole frame bloats the notebook.
train_df.head()

In [None]:
## TODO (translated from Norwegian):
## - start a new workbook with self-generated data
## - learn to use a train/test split before k-folds
## - follow the recipe in chapter 2 of the textbook


# Fit a random forest regressor to a single target and score it on the
# training data.
from sklearn.metrics import mean_squared_error


# oob_score=True makes the forest also keep out-of-bag estimates.
regressor = RandomForestRegressor(n_estimators=100, random_state=42, oob_score=True)

# Features: column positions 6..57 of train_df (per the notebook's notes, the
# 26 SBM components followed by the 26 FNC PCA components).
x = train_df.iloc[:, 6:58].values
# Target: column position 1 ("age" given the column order set earlier).
y = train_df.iloc[:, 1].values
regressor.fit(x, y)

# Predictions and score on the very data the model was trained on, so both
# are optimistic compared with unseen data.
age_predictions = regressor.predict(x)
regressor_score = regressor.score(x, y)

# Out-of-bag results (Hands-On Machine Learning, p.197): the score and the
# per-sample predictions are computed only from trees that did not see the
# sample during training. The score is not a probability of being correct.
oob_scores = regressor.oob_score_
oob_predictions = regressor.oob_prediction_

# RMSE on the whole training set (Hands-On Machine Learning, p.76).
age_predictions_mse = mean_squared_error(y, age_predictions)
age_predictions_rmse = np.sqrt(age_predictions_mse)


# Display model accuracy scores.
# - regressor_score: best possible value is 1.0, and it can be negative.
# - forest_mse: mean squared error (about 14.5 at the time of writing).
# - forest_rmse: lower is better; it is expressed in the units of the target,
#   so judge it against the target's scale rather than a fixed threshold.
print()
for label, value in (
    ("regressor_score: ", regressor_score),
    ("forest_mse: ", age_predictions_mse),
    ("forest_rmse: ", age_predictions_rmse),
    ("Oob scores: ", oob_scores),
    ("Oob predictions: ", oob_predictions),
):
    print(label + str(value))


# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor.score

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_val_predict

# 10-fold cross-validated predictions for the current x / y.
predictions = cross_val_predict(estimator=regressor, X=x, y=y, cv=10)
predictions

In [None]:
# Shape of the cross-validated predictions (expected to match the number of
# rows in x - verify).
predictions.shape

In [None]:
from sklearn.model_selection import cross_val_score

# cross_val_score returns scores where higher is better, so the MSE has to be
# requested as its negative and sign-flipped before taking the square root.
# Without scoring=..., a regressor is scored with R^2, and sqrt(R^2) is not an
# RMSE (it even yields NaN for folds with a negative R^2).
scores = cross_val_score(regressor, x, y, scoring="neg_mean_squared_error", cv=10)
age_predictions_CV_rmse_scores = np.sqrt(-scores)

def display_scores(scores):
    """Print an array of cross-validation scores with its mean and std.

    ``scores`` holds one value per cross-validation run and must provide
    ``.mean()`` and ``.std()`` (e.g. a NumPy array).
    """
    summary_lines = (
        ("Scores:", scores),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary_lines:
        print(label, value)

display_scores(age_predictions_CV_rmse_scores)

# https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter

In [None]:
# from sklearn.model_selection import cross_val_score
# scores = cross_val_score(regressor, x, y, scoring="neg_mean_squared_error", cv=10)
# age_predictions_rmse_scores = np.sqrt(-scores)
# def display_scores(scores):
#    print("Scores:", scores)
#    print("Mean:", scores.mean())
#    print("Standard deviation:", scores.std())

# display_scores(age_predictions_rmse_scores)

In [None]:
# and let’s separate the predictors and the labels since we don’t necessarily want to apply
# the same transformations to the predictors and the target values (note that drop()
# creates a copy of the data and does not affect strat_train_set):
# housing = strat_train_set.drop("median_house_value", axis=1)
# housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# Fit one random forest per target and evaluate each on the training data.

regressor = RandomForestRegressor(n_estimators = 100, random_state = 42, oob_score=True)

# Width of the result arrays: column 0 holds the subject Id, columns
# 1..targets-1 hold one target each (same positions as in train_df).
targets = 3

# Features are the same for every target, so they are extracted once.
x = train_df.iloc[:, 6:58].values

xmap = np.zeros(shape=(len(test_df), targets))
xmap_train = np.zeros(shape=(len(train_df), targets))
# Out-of-bag predictions get their own 2-D array here. Previously this cell
# wrote into whatever `oob_predictions` an earlier cell had left behind (a 1-D
# array), which fails when a whole prediction vector is assigned to one element.
oob_predictions = np.zeros(shape=(len(train_df), targets))
# One importance value per feature and per result column.
feature_importance = np.zeros(shape=(x.shape[1], targets))
metrics = [0] * targets
oob_scores = [0] * targets
regressor_score = [0] * targets

# Subject Id goes in the first column; it does not change between targets.
xmap_train[:, 0] = train_df.iloc[:, 0].values

for i in range(1, targets):
    # Target i, in train_df column order: "age", "domain1_var1", ...
    y = train_df.iloc[:, i].values
    regressor.fit(x, y)

    # Out-of-bag score and predictions come from trees that did not see the
    # sample during training (Hands-On Machine Learning, p.197).
    oob_scores[i] = regressor.oob_score_
    oob_predictions[:, i] = regressor.oob_prediction_

    # Predictions and score on the training data itself (optimistic).
    xmap_train[:, i] = regressor.predict(x)
    regressor_score[i] = regressor.score(x, y)

    # Compare known training scores with the predictions using metric()
    # defined at the top of the notebook (lower is better).
    y_true = train_df.iloc[:, i]
    y_pred = xmap_train[:, i]
    metrics[i] = metric(y_true, y_pred)

    # Which features mattered for predicting this target.
    feature_importance[:, i] = regressor.feature_importances_

    # Display model accuracy scores
    print(i)
    print("For Target: " + str(train_df.columns[i]))
    print("Regressor score: " + str(regressor_score[i]))
    print("Metrics: " + str(metrics[i]))
    print("Oob scores: " + str(oob_scores[i]))
    print(" ")
    

In [None]:
# Fine-Tune Your Model p.79 Hands On Machine Learning
# Let’s assume that you now have a shortlist of promising models. You now need to fine-tune them. Let’s look at a few ways you can do that.
# Grid Search
# One way to do that would be to fiddle with the hyperparameters manually, until you
# find a great combination of hyperparameter values. This would be very tedious work,
# and you may not have time to explore many combinations.
# Instead you should get Scikit-Learn’s GridSearchCV to search for you. All you need to
# do is tell it which hyperparameters you want it to experiment with, and what values to
# try out, and it will evaluate all the possible combinations of hyperparameter values,
# using cross-validation. For example, the following code searches for the best combination
# of hyperparameter values for the RandomForestRegressor:

In [None]:
# Name of the column at position 1 of train_df (the position used as target).
train_df.columns[1]

In [None]:
 # All column names of train_df, in positional order.
 list(train_df.columns)

In [None]:
# FIXME: incomplete assignment left over from experimentation. As written it
# is a SyntaxError, so it is disabled until a right-hand side is supplied.
# a =

In [None]:
# Score of the most recently fitted regressor on whatever x and y currently
# hold (for a scikit-learn regressor, score() is the R^2 coefficient).
regressor.score(x, y)

In [None]:
# Improving Model

from sklearn.feature_selection import VarianceThreshold
# https://scikit-learn.org/stable/modules/feature_selection.html

In [None]:
# y_test is only created by the K-fold cell further down, so evaluating it
# here raises NameError on a fresh kernel (Restart & Run All). Disabled; the
# same check is repeated after the K-fold cell.
# y_test.shape

In [None]:
# FIXME: unfinished K-fold loop. It has no body (SyntaxError) and `k_fold` is
# not defined in any cell before this one. Disabled until it is completed; a
# working KFold loop lives in the "TESTING K-FOLDS" section below.
# for k, (train, test) in enumerate(k_fold.split(X, y)):


In [None]:
# Train one random forest per target on the training data and predict that
# target for the test subjects.

regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# Result arrays: column 0 is the subject Id, columns 1-5 are the targets
# "age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2".
xmap = np.zeros(shape=(len(test_df), 6))
feature_importance = np.zeros(shape=(52, 6))

# Feature matrices (column positions 6..57) are identical for every target.
x = train_df.iloc[:, 6:58].values
x_test = test_df.iloc[:, 6:58].values

# Subject Id of every test row goes in the first column.
xmap[:, 0] = test_df.iloc[:, 0].values

for i in range(1, 6):
    y = train_df.iloc[:, i].values  # target at column position i
    regressor.fit(x, y)  # refit the same estimator for this target
    xmap[:, i] = regressor.predict(x_test)
    feature_importance[:, i] = regressor.feature_importances_
    print(i)

In [None]:
# Make dataframe with results
estimations_df = pd.DataFrame(xmap, columns = ["Id","age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"])
# xmap is a float array, so the Ids come back as floats (e.g. 10001.0).
# Convert through an integer instead of slicing the first five characters of
# the string, which would silently corrupt any Id that is not exactly five
# digits long.
estimations_df['Id'] = estimations_df['Id'].astype('int64').astype(str)
estimations_df

In [None]:
# Reshape the predictions to the competition format and save them as CSV:
# one row per (subject, target), with Id written as "<Id>_<target name>".
df_out = (
    estimations_df
    .melt(id_vars='Id', value_name='Predicted')
    .sort_values(by=['Id', 'variable'])
    .assign(Id=lambda long_df: long_df["Id"].astype(str) + '_' + long_df["variable"].astype(str))
    .drop(columns='variable')
)

df_out.to_csv('df_out.csv', index=False)  # write the submission file

In [None]:
# Preview df_out. Printing every row floods the notebook output, so only the
# first rows are shown; the complete table is in df_out.csv.
with pd.option_context('display.max_columns', None):
    print(df_out.head(20))


In [None]:
# Create dataframe with feature importance.
# The array filled by the prediction loop is called `feature_importance`;
# the name `importance` used here before is never defined (NameError).
feature_importance_df = pd.DataFrame(feature_importance, columns = ["Feature","age", "domain1_var1", "domain1_var2", "domain2_var1", "domain2_var2"])
# Column 0 of the array is an unused placeholder; overwrite it with the names
# of the feature columns (positions 6..57 of train_df).
feature_importance_df["Feature"] = train_df.columns[6:58]

In [None]:
# Features ranked by their importance for predicting domain2_var2, highest first.
feature_importance_df.sort_values(by="domain2_var2", ascending=False)

In [None]:

## TESTING K-FOLDS


In [None]:
# Note:
# k = 10 is a commonly suggested value: a lower k moves towards a simple
# validation split, while a higher k approaches the leave-one-out (LOOCV) method.

In [None]:
import numpy as np
from sklearn.model_selection import KFold

# Features (column positions 6..57) and the target at column position 1.
X = train_df.iloc[:,6:58].values
y = train_df.iloc[:,1].values
kf = KFold(n_splits=5)

print(kf)

# One metric value per fold, computed on that fold's held-out rows.
fold_metrics = []

for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Fit a fresh forest inside the loop. Previously the fit sat after the
    # loop, so only the last fold's training split was ever used and no fold
    # was evaluated.
    cv_regressor = RandomForestRegressor(n_estimators = 100, random_state = 42)
    cv_regressor.fit(X_train, y_train)

    fold_metrics.append(metric(y_test, cv_regressor.predict(X_test)))

print("Metric per fold:", fold_metrics)
print("Mean metric:", np.mean(fold_metrics))



In [None]:
# Score the K-fold model on the rows it was not trained on. Previously this
# compared the full target vector `y` with a `y_pred` left over from an
# unrelated cell, so the two did not describe the same rows or target.
y_true = y_test
y_pred = cv_regressor.predict(X_test)
metric(y_true, y_pred)

In [None]:
# Shape of the held-out targets from the last K-fold split.
y_test.shape

In [None]:
# Shape of the training targets from the last K-fold split.
y_train.shape