Choosing a Dataset


In [None]:
# Download the UCI Parkinson's telemonitoring dataset into the working directory.
# -nc (--no-clobber) skips the download when the file is already present, so re-running
# this cell does not pile up numbered copies (parkinsons_updrs.data.1, .2, ...) that the
# loading cell, which reads 'parkinsons_updrs.data', would never pick up.
!wget -nc https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data

--2023-09-18 03:40:51--  https://archive.ics.uci.edu/ml/machine-learning-databases/parkinsons/telemonitoring/parkinsons_updrs.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘parkinsons_updrs.data.6’

parkinsons_updrs.da     [ <=>                ] 889.90K  5.39MB/s    in 0.2s    

2023-09-18 03:40:51 (5.39 MB/s) - ‘parkinsons_updrs.data.6’ saved [911261]



In [None]:
import numpy as np
import pandas as pd
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Parse the downloaded comma-separated file; its first row supplies the column names.
data = pd.read_csv('parkinsons_updrs.data', sep=',', header=0)


In [None]:
# Preview the first five rows to sanity-check the parsed columns and value ranges.
data.head()

Unnamed: 0,subject#,age,sex,test_time,motor_UPDRS,total_UPDRS,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,...,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
0,1,72,0,5.6431,28.199,34.398,0.00662,3.4e-05,0.00401,0.00317,...,0.23,0.01438,0.01309,0.01662,0.04314,0.01429,21.64,0.41888,0.54842,0.16006
1,1,72,0,12.666,28.447,34.894,0.003,1.7e-05,0.00132,0.0015,...,0.179,0.00994,0.01072,0.01689,0.02982,0.011112,27.183,0.43493,0.56477,0.1081
2,1,72,0,19.681,28.695,35.389,0.00481,2.5e-05,0.00205,0.00208,...,0.181,0.00734,0.00844,0.01458,0.02202,0.02022,23.047,0.46222,0.54405,0.21014
3,1,72,0,25.647,28.905,35.81,0.00528,2.7e-05,0.00191,0.00264,...,0.327,0.01106,0.01265,0.01963,0.03317,0.027837,24.445,0.4873,0.57794,0.33277
4,1,72,0,33.642,29.187,36.375,0.00335,2e-05,0.00093,0.0013,...,0.176,0.00679,0.00929,0.01819,0.02036,0.011625,26.126,0.47188,0.56122,0.19361


In [None]:
# Count missing values per column. Every count in the output is 0, so no imputation is needed.
data.isnull().sum()

subject#         0
age              0
sex              0
test_time        0
motor_UPDRS      0
total_UPDRS      0
Jitter(%)        0
Jitter(Abs)      0
Jitter:RAP       0
Jitter:PPQ5      0
Jitter:DDP       0
Shimmer          0
Shimmer(dB)      0
Shimmer:APQ3     0
Shimmer:APQ5     0
Shimmer:APQ11    0
Shimmer:DDA      0
NHR              0
HNR              0
RPDE             0
DFA              0
PPE              0
dtype: int64

In [None]:
# Count fully duplicated rows. The result is 0, so there are no duplicates to drop.
data.duplicated().sum()

0

In [None]:
# Inspect the column dtypes. Every column is int64 or float64, so no categorical encoding is required.
# NOTE(review): `sex` is stored as an integer but looks like an encoded category - confirm against the dataset docs.
data.dtypes

subject#           int64
age                int64
sex                int64
test_time        float64
motor_UPDRS      float64
total_UPDRS      float64
Jitter(%)        float64
Jitter(Abs)      float64
Jitter:RAP       float64
Jitter:PPQ5      float64
Jitter:DDP       float64
Shimmer          float64
Shimmer(dB)      float64
Shimmer:APQ3     float64
Shimmer:APQ5     float64
Shimmer:APQ11    float64
Shimmer:DDA      float64
NHR              float64
HNR              float64
RPDE             float64
DFA              float64
PPE              float64
dtype: object

In [None]:

# Pairwise correlation between all columns of the dataset.
correlation_matrix = data.corr()

# Keep only the column relating each feature to the total_UPDRS target.
total_updrs_correlation = correlation_matrix.loc[:, 'total_UPDRS']

# Rank features by correlation magnitude (sign dropped), strongest first.
sorted_correlations = abs(total_updrs_correlation).sort_values(ascending=False)

# Print the heading followed by the ranked correlations.
print("Top Correlations with Total UPDRS:", sorted_correlations, sep="\n")


Top Correlations with Total UPDRS:
total_UPDRS      1.000000
motor_UPDRS      0.947231
age              0.310290
subject#         0.253643
HNR              0.162117
RPDE             0.156897
PPE              0.156195
Shimmer:APQ11    0.120838
DFA              0.113475
Shimmer(dB)      0.098790
sex              0.096559
Shimmer          0.092141
Shimmer:APQ5     0.083467
Shimmer:DDA      0.079363
Shimmer:APQ3     0.079363
test_time        0.075263
Jitter(%)        0.074247
Jitter(Abs)      0.066927
Jitter:DDP       0.064027
Jitter:RAP       0.064015
Jitter:PPQ5      0.063352
NHR              0.060952
Name: total_UPDRS, dtype: float64


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

class LinearRegression:
    """Linear regression trained with full-batch gradient descent on the MSE loss.

    The intercept is learned as the last entry of ``weights``: ``add_intercept``
    appends a column of ones to the feature matrix before fitting and predicting.

    Attributes set by ``fit``:
        weights: 1-D array of length ``n_features + 1`` (intercept last).
        mse_history: training MSE measured at the start of each iteration,
            i.e. before that iteration's weight update.
    """

    def __init__(self, learning_rate=0.001, num_iterations=50, verbose=False):
        """Store the optimisation settings.

        Args:
            learning_rate: Step size applied to the gradient on every update.
            num_iterations: Number of gradient-descent updates performed by ``fit``.
            verbose: When true, ``fit`` prints MSE and R2 every 100 iterations.
        """
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.verbose = verbose

    def load_data(self, data_path):
        """Read a comma-separated file with a header row into ``self.data``."""
        self.data = pd.read_table(data_path, sep=',', header=0)

    def split_data(self, test_size=0.2, target_column='total_UPDRS', random_state=42):
        """Split ``self.data`` into train/test features and target.

        Args:
            test_size: Passed through to ``train_test_split``.
            target_column: Column used as the target; all other columns are features.
            random_state: Seed passed to ``train_test_split`` for a reproducible split.

        Sets ``X_train``, ``X_test``, ``y_train`` and ``y_test`` on the instance.
        Requires ``load_data`` to have been called first.
        """
        X = self.data.drop(columns=[target_column])  # Features
        y = self.data[target_column]  # Target variable

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state
        )

    def add_intercept(self, X):
        """Return ``X`` as a float array with a trailing column of ones appended."""
        X = np.asarray(X, dtype=float)
        return np.column_stack([X, np.ones(X.shape[0])])

    def fit(self, X, y):
        """Fit the weights by gradient descent, starting from all zeros.

        Args:
            X: Feature matrix (array-like or DataFrame), one row per sample.
            y: Targets (array-like or Series). Column vectors of shape ``(n, 1)``
                are flattened so the residual stays one-dimensional.

        Raises:
            ValueError: If there are no samples, or ``X`` and ``y`` disagree on
                the number of samples.
        """
        X = self.add_intercept(X)
        # Flatten y: subtracting an (n, 1) target from the (n,) prediction would
        # silently broadcast to an (n, n) matrix and corrupt the gradient.
        y = np.asarray(y, dtype=float).ravel()
        num_samples, num_features = X.shape

        if num_samples == 0:
            raise ValueError("Cannot fit on an empty dataset.")
        if y.shape[0] != num_samples:
            raise ValueError(
                f"X has {num_samples} samples but y has {y.shape[0]}."
            )

        # Initialize weights with zeros
        self.weights = np.zeros(num_features)

        # To store MSE values during training
        self.mse_history = []

        # Total sum of squares does not change between iterations; it is only
        # needed for the R2 value shown in verbose mode.
        total_sum_squares = np.sum((y - np.mean(y)) ** 2) if self.verbose else None

        for iteration in range(self.num_iterations):
            y_pred = np.dot(X, self.weights)
            error = y_pred - y

            # MSE of the current weights, recorded before they are updated.
            mse = np.mean(error ** 2)
            self.mse_history.append(mse)

            if self.verbose and iteration % 100 == 0:
                r2 = 1 - (np.sum(error ** 2) / total_sum_squares)
                print(f"Iteration {iteration}: MSE = {mse}, R2 = {r2}")

            # Gradient step on the weights (X^T error averaged over the samples).
            gradient = (1 / num_samples) * np.dot(X.T, error)
            self.weights -= self.learning_rate * gradient

    def predict(self, X):
        """Return predictions for ``X`` using the fitted weights (requires ``fit``)."""
        X = self.add_intercept(X)
        return np.dot(X, self.weights)

    def evaluate(self, X_test, y_test):
        """Return ``(mse, r2)`` of the predictions for ``X_test`` against ``y_test``."""
        y_pred = self.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        return mse, r2

if __name__ == "__main__":
    # Imported here so this cell does not rely on `math` having been imported by another cell.
    import math

    # Load the dataset and build the train/test split shared by every grid candidate.
    data_path = 'parkinsons_updrs.data'
    model = LinearRegression()
    model.load_data(data_path)
    model.split_data()

    # Hyperparameter optimization and logging
    hyperparameter_grid = {
        'learning_rate': [0.000001, 0.000002, 0.000003],
        'num_iterations': [1000, 1500, 2000]
    }

    best_mse = float('inf')
    best_params = None
    best_model = None  # fitted model that produced best_mse

    # NOTE(review): candidates are ranked by *test* MSE, so the reported best score is
    # optimistically biased. Consider carving a validation split out of the training data.
    with open('hyperparameter_log1.txt', 'w') as log_file:
        for learning_rate in hyperparameter_grid['learning_rate']:
            for num_iterations in hyperparameter_grid['num_iterations']:
                # Train a fresh model for each hyperparameter combination
                model_instance = LinearRegression(learning_rate=learning_rate, num_iterations=num_iterations)
                model_instance.fit(model.X_train, model.y_train)

                # Score the candidate on the training and test data
                train_mse = mean_squared_error(model.y_train, model_instance.predict(model.X_train))
                mse, r2 = model_instance.evaluate(model.X_test, model.y_test)

                # Log the results
                log_file.write(f"Learning Rate: {learning_rate}, Num Iterations: {num_iterations}\n")
                log_file.write(f"MSE (Train): {train_mse}\n")
                log_file.write(f"MSE (Test): {mse}\n")
                log_file.write(f"R2 (Test): {r2}\n\n")

                # Remember the candidate with the lowest test MSE, including the fitted model,
                # so the plots below describe the reported best model rather than whichever
                # candidate happened to be trained last.
                if mse < best_mse:
                    best_mse = mse
                    best_params = {'learning_rate': learning_rate, 'num_iterations': num_iterations}
                    best_model = model_instance

    if best_model is None:
        raise RuntimeError("No hyperparameter combination improved on the initial best MSE; nothing to plot.")

    print("Best Hyperparameters:")
    print(best_params)
    print("Best Test MSE:", best_mse)

    # Plot MSE vs. number of iterations for the best model (values rounded up to 2 decimals).
    my_formatted_list = [math.ceil(elem * 100) / 100 for elem in best_model.mse_history]
    # Print only the ends of the curve; the full history has one entry per iteration.
    print(f"MSE history (rounded up to 2 d.p.): first 5 = {my_formatted_list[:5]}, last 5 = {my_formatted_list[-5:]}")
    plt.plot(range(1, len(best_model.mse_history) + 1), my_formatted_list)
    plt.xlabel('Number of Iterations')
    plt.ylabel('Mean Squared Error (MSE)')
    plt.title('MSE vs. Number of Iterations')
    plt.show()

    # Test-set predictions of the best model, computed once and reused by every plot below.
    y_pred = best_model.predict(model.X_test)

    # Plot the output variable (total_UPDRS), actual and predicted, against two attributes.
    for column, label in (('age', 'Age'), ('motor_UPDRS', 'motor_UPDRS ')):
        plt.scatter(model.X_test[column], model.y_test, label='Actual')
        plt.scatter(model.X_test[column], y_pred, label='Predicted', marker='x')
        plt.xlabel(label)
        plt.ylabel('Total UPDRS')
        plt.title(f'Total UPDRS vs. {label}')
        plt.legend()
        plt.show()

    # Residual Plot vs. Predicted Values
    residuals = model.y_test - y_pred
    plt.scatter(y_pred, residuals)
    plt.xlabel("Predicted Values")
    plt.ylabel("Residuals")
    plt.title("Residual Plot vs. Predicted Values")
    plt.axhline(y=0, color='r', linestyle='--')
    plt.show()

    # Residual Histogram
    plt.hist(residuals, bins=30)
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.title("Residual Histogram")
    plt.show()

    # Actual vs. Predicted Values Plot
    plt.scatter(model.y_test, y_pred)
    plt.xlabel("Actual Values")
    plt.ylabel("Predicted Values")
    plt.title("Actual vs. Predicted Values")
    plt.show()






