In [39]:
## Import the required libraries.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch

In [50]:
## Load the raw text file with pandas.
# The file is formatted like a CSV, so read_csv() can parse it directly.
# header=None tells pandas that the first line is data rather than column names,
# and nrows limits the load to the first N_ROWS rows of the file.
# NOTE(review): DATA_FILE is an absolute path on one machine; consider making it
# configurable so the notebook can run elsewhere.
DATA_FILE = "C:/Users/excel/Downloads/yearpredictionmsd/YearPredictionMSD.txt"
N_ROWS = 50000

data = pd.read_csv(DATA_FILE, header=None, nrows=N_ROWS)
data.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


In [52]:
## Verify whether any qualitative (non-numeric) data is present in the dataset.

cols = data.columns  # labels of all columns

# Labels of the numeric columns, obtained through the public select_dtypes() API
# rather than the private DataFrame._get_numeric_data() helper.
# "bool" is listed next to "number" so that boolean columns still count as
# numeric, which is understood to match the private helper -- TODO confirm.
num_cols = data.select_dtypes(include=["number", "bool"]).columns

# Non-numeric (categorical) columns, in their original column order.
# If any exist, they must be removed or encoded before modelling.
[col for col in cols if col not in num_cols]

## An empty list indicates that there are no categorical features to deal with.

[]

In [54]:
## Check for missing values.

# isnull() flags every missing cell; the first sum() counts the flags per column.
nulls_per_column = data.isnull().sum()

# Adding the per-column counts gives the total number of missing cells in the
# whole frame. For the dataset in use, this total should be 0.
nulls_per_column.sum()

0

In [55]:
## Check for outliers.

# A value counts as an outlier when it lies more than N_SIGMA sample standard
# deviations away from the mean of its own column (the 3-sigma limits).
N_SIGMA = 3

# Column statistics are computed once for the whole frame.
col_means = data.mean()
col_stds = data.std()
lower_limits = col_means - N_SIGMA * col_stds
upper_limits = col_means + N_SIGMA * col_stds

# Vectorised check: axis="columns" matches each limit to its column by label,
# so every value is compared against the limits of the column it belongs to.
# Comparisons involving missing values are False, so they are never counted.
is_outlier = (
    data.lt(lower_limits, axis="columns") | data.gt(upper_limits, axis="columns")
)

# Share of rows flagged in each column. This is a fraction in [0, 1],
# not a percentage (0.019 means 1.9 % of the rows).
outlier_fraction = is_outlier.sum() / data.shape[0]

# Column label -> fraction formatted as a string with three decimals.
outliers = {col: "%.3f" % frac for col, frac in outlier_fraction.items()}

print(outliers)

{0: '0.019', 1: '0.010', 2: '0.011', 3: '0.011', 4: '0.015', 5: '0.008', 6: '0.010', 7: '0.010', 8: '0.010', 9: '0.011', 10: '0.005', 11: '0.010', 12: '0.010', 13: '0.015', 14: '0.017', 15: '0.016', 16: '0.016', 17: '0.014', 18: '0.016', 19: '0.013', 20: '0.016', 21: '0.015', 22: '0.013', 23: '0.015', 24: '0.013', 25: '0.018', 26: '0.017', 27: '0.017', 28: '0.018', 29: '0.017', 30: '0.016', 31: '0.015', 32: '0.017', 33: '0.016', 34: '0.015', 35: '0.016', 36: '0.017', 37: '0.017', 38: '0.016', 39: '0.015', 40: '0.017', 41: '0.017', 42: '0.015', 43: '0.014', 44: '0.016', 45: '0.015', 46: '0.017', 47: '0.017', 48: '0.017', 49: '0.015', 50: '0.016', 51: '0.015', 52: '0.015', 53: '0.015', 54: '0.016', 55: '0.018', 56: '0.017', 57: '0.016', 58: '0.013', 59: '0.016', 60: '0.016', 61: '0.015', 62: '0.016', 63: '0.017', 64: '0.017', 65: '0.016', 66: '0.018', 67: '0.015', 68: '0.018', 69: '0.016', 70: '0.017', 71: '0.016', 72: '0.018', 73: '0.015', 74: '0.016', 75: '0.015', 76: '0.017', 77: '0.0

In [69]:
## Separate the features from the target data.
# The target is the column at position TARGET_POSITION (the first one);
# every column after it is used as a feature.
TARGET_POSITION = 0

Y = data.iloc[:, TARGET_POSITION]
X = data.iloc[:, TARGET_POSITION + 1:]

# Preview the first rows of the features, then of the target.
for preview in (X, Y):
    print(preview.head())

         1         2         3         4         5         6         7   \
0  49.94357  21.47114  73.07750   8.74861 -17.40628 -13.09905 -25.01202   
1  48.73215  18.42930  70.32679  12.94636 -10.32437 -24.83777   8.76630   
2  50.95714  31.85602  55.81851  13.41693  -6.57898 -18.54940  -3.27872   
3  48.24750  -1.89837  36.29772   2.58776   0.97170 -26.21683   5.05097   
4  50.97020  42.20998  67.09964   8.46791 -15.85279 -16.81409 -12.48207   

         8         9        10  ...        81         82        83        84  \
0 -12.23257   7.83089 -2.46783  ...  13.01620  -54.40548  58.99367  15.37344   
1  -0.92019  18.76548  4.59210  ...   5.66812  -19.68073  33.04964  42.87836   
2  -2.35035  16.07017  1.39518  ...   3.03800   26.05866 -50.92779  10.93792   
3 -10.34124   3.55005 -6.36304  ...  34.57337 -171.70734 -16.96705 -46.67617   
4  -9.37636  12.63699  0.93609  ...   9.92661  -55.95724  64.92712 -17.72522   

         85        86         87        88         89        90  
0 

In [70]:
## Rescale the data using the standardization methodology.
# Standardization (Z-score normalization) transforms each column so that it has
# a mean of 0 and a standard deviation of 1.
# Normalization and standardization are both techniques to scale data, but they
# are not the same thing.
#
# StandardScaler().fit_transform(X) is not preferred here because it returns a
# NumPy array, which has no .head(); doing the arithmetic in pandas keeps X a
# DataFrame. pandas' .std() defaults to the sample standard deviation (ddof=1).
#
# NOTE(review): the mean and std are computed over all rows, before the data is
# split into training/validation/test sets, so held-out rows influence the
# scaling. Consider computing the statistics on the training rows only.

feature_means = X.mean()

# A constant column has a standard deviation of 0 and would become NaN (0 / 0).
# Dividing by 1 instead leaves such a column at 0 after centering.
feature_stds = X.std().replace(0, 1)

X = (X - feature_means) / feature_stds
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
0,1.082657,0.382437,1.841985,0.459652,-0.480074,-0.282606,-1.590785,-1.300854,0.378336,-0.683719,...,-0.086005,0.099339,0.148291,-0.255625,0.040944,-0.362616,0.524542,-0.467668,-0.247579,0.036872
1,0.880874,0.321953,1.763666,0.717085,-0.165507,-1.188896,0.777905,0.122576,1.420531,0.401198,...,-0.316635,0.301448,-0.063611,0.031855,-0.655124,-0.443921,0.536517,0.573191,0.209887,1.155171
2,1.251484,0.588929,1.350579,0.745944,0.000857,-0.703401,-0.066747,-0.05738,1.163637,-0.090081,...,-0.399185,0.567666,-0.749508,-0.301984,-0.034072,0.227059,-0.528413,-0.335333,0.109957,-0.095865
3,0.800148,-0.08224,0.794774,0.081829,0.336246,-1.295366,0.517369,-1.062869,-0.029679,-1.282306,...,0.590596,-0.583396,-0.472129,-0.904164,-0.820141,0.577357,-0.282033,0.412329,0.961849,0.789313
4,1.25366,0.794806,1.671781,0.442438,-0.411071,-0.569426,-0.712128,-0.941459,0.836414,-0.16063,...,-0.182976,0.090307,0.196753,-0.60157,-0.123595,-0.223957,0.429005,0.260874,0.19516,1.238096


In [71]:
## Split the data into three sets: training, validation, and testing.

TEST_FRACTION = 0.2   # share of all rows held out for testing
VAL_FRACTION = 0.25   # share of the remaining rows used for validation
                      # (0.25 of the remaining 0.8 = 0.2 of all rows)
SPLIT_SEED = 42       # fixed seed so the shuffled split is reproducible

# Stage 1: separate the test set from the combined training+validation rows.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, Y, test_size=TEST_FRACTION, random_state=SPLIT_SEED, shuffle=True
)

# Stage 2: divide the remaining rows into the training and validation sets.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=VAL_FRACTION, random_state=SPLIT_SEED, shuffle=True
)

# Alternative approach (not used here): shuffle the rows once, then cut the
# shuffled frame into consecutive positional slices with .iloc:
#   rows [:train_end]        -> training set
#   rows [train_end:dev_end] -> validation (dev) set
#   rows [dev_end:]          -> test set
# applying the same row boundaries to the features and to the target.
# It relies on X_shuffle, Y_shuffle, train_end and dev_end, which are not
# defined in the cells shown here.

for label, part in (("x_train: ", X_train), ("y_train: ", y_train)):
    print(label, part)

x_train:               1         2         3         4         5         6         7   \
29605 -0.385264  1.823136 -0.746522 -1.421827 -0.662783 -1.200023 -0.959670   
33044 -0.967556 -2.039817  1.484554  1.722469 -0.068113  0.959097  0.147228   
16594  0.828979  0.305381  1.395027  0.190674 -1.073875 -0.396534  0.315176   
21719 -0.575675 -0.580027  2.035389  1.193469  0.552159  2.675955 -1.723913   
24972 -1.037800  0.193346 -0.638358 -1.953306  0.872854  1.345356 -0.644967   
...         ...       ...       ...       ...       ...       ...       ...   
27517 -0.445843  0.355386  0.407306  0.625124 -1.090275  1.304770 -1.277503   
28392 -0.879290 -0.759885  0.327276 -0.162003 -1.348607 -1.533285  2.260646   
5776  -0.372228  1.408040 -0.398839 -0.147654  0.256481  2.135902 -0.034754   
24864 -0.041214 -0.775398 -0.621410 -0.898667  0.469864 -0.405432 -0.046723   
7756  -0.376301 -1.102319 -1.355764  1.032471 -0.091397  0.529688 -0.655466   

             8         9         10  ... 

In [45]:
#head() is a method specific to pandas DataFrames, not PyTorch tensors. PyTorch tensors do not have .head() or similar DataFrame methods, as they are primarily used for numerical operations and not for data manipulation or inspection.