In [None]:
import numpy as np
import pandas as pd
#Both files have been added directly from the 'BigMart Sales Data' dataset
train = pd.read_csv('../input/bigmart-sales-data/Train.csv')
test = pd.read_csv('../input/bigmart-sales-data/Test.csv')
#Printing the (rows, columns) shape of each frame, one per line
print(train.shape, test.shape, sep='\n')

In [None]:
#Checking datatypes and missing values; info() prints its own report and returns None,
#so it is called directly instead of being wrapped in print() (which added a stray "None" line)
train.info()
test.info()

In [None]:
#Checking the number of missing values for each column, train first and then test
print(train.isna().sum(), test.isna().sum(), sep='\n')

In [None]:
train.columns = train.columns.str.lower()    #Changing column names to lowercase so I don't have to worry about case
test.columns = test.columns.str.lower()

In [None]:
train.head()

In [None]:
#Imputing missing values using the fillna method
train.item_weight.fillna(train.item_weight.mean(), inplace = True)   #Imputing missing item_weight by mean item_weight (numerical data)   
train.outlet_size.fillna(train.outlet_size.mode()[0], inplace=True)  #Imputing missing outlet_size by its mode (categorical data)

In [None]:
test.item_weight.fillna(test.item_weight.mean(), inplace = True)
test.outlet_size.fillna(test.outlet_size.mode()[0], inplace=True)

In [None]:
#Re-checking the number of missing values for each column after the imputation cells
print(train.isna().sum(), test.isna().sum(), sep='\n')

In [None]:
train.columns

In [None]:
train.drop(train[['item_identifier','outlet_identifier','outlet_establishment_year']], inplace=True,  axis=1)
test.drop(test[['item_identifier','outlet_identifier','outlet_establishment_year']], inplace=True,  axis=1)

In [None]:
print(train.columns)
print(test.columns)

In [None]:
pd.get_dummies(train)
#item_fat_content column has duplicate values - LF, Low Fat and low fat; Regular and reg

In [None]:
train.item_fat_content.replace(['LF','low fat','reg'],['Low Fat','Low Fat','Regular'], inplace=True)   #Relpacing duplicate va;ues
test.item_fat_content.replace(['LF','low fat','reg'],['Low Fat','Low Fat','Regular'], inplace=True)

In [None]:
#Checking the other categorical columns for duplicate labels, one array of unique values per line
print(
    train['item_type'].unique(),
    train['outlet_size'].unique(),
    train['outlet_location_type'].unique(),
    train['outlet_type'].unique(),
    sep='\n',
)

In [None]:
train = pd.get_dummies(train)      #Dummifying categorical variables
test = pd.get_dummies(test)

In [None]:
train_x = train.drop(train[['item_outlet_sales']], axis=1)   #Separating dependent and independent variables of the train dataset
train_y = train['item_outlet_sales']

In [None]:
def scaler(data):
    """Min-max scale every column of a DataFrame to the range [0, 1].

    Each column is transformed as (value - column minimum) / (column maximum - column minimum).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all numeric or boolean.

    Returns
    -------
    pandas.DataFrame
        A new float frame with the same index and columns. The input frame
        is left unchanged, so callers must use the returned value.
    """
    #Casting first so boolean/integer columns support subtraction and true division
    values = data.astype(float)
    lowest = values.min()
    span = values.max() - lowest
    #A constant column has a zero span; dividing by 1 instead maps it to 0.0 rather than NaN
    span = span.mask(span == 0, 1.0)
    return (values - lowest) / span

In [None]:
#Both frames must be scaled with the SAME minimum and range, otherwise train and test rows
#live in different feature spaces and the distances between them are distorted.
#The statistics are taken from the training features before train_x itself is normalized.
train_x = train_x.astype(float)
test = test.reindex(columns=train_x.columns, fill_value=0).astype(float)   #Same columns, same order as train_x
feature_min = train_x.min()
feature_span = (train_x.max() - feature_min).replace(0, 1)   #Constant columns: divide by 1 instead of 0
test = (test - feature_min) / feature_span       #Normalizing the test dataframe with the train statistics
train_x = scaler(train_x)      #Normalizing the train_x dataframe
train_x.head()                 #Showing the normalized features (train itself is not scaled)

In [None]:
test.head()

In [None]:
train_array = np.array(train_x)
test_array = np.array(test)

In [None]:
#Calculating distance between test and train row using (train - test)^2 = train^2 + test^2 - 2*train*test
def euclidian_distance(trainset, testset):
    """Return the matrix of Euclidean distances between train rows and test rows.

    Uses the expansion |a - b|^2 = |a|^2 + |b|^2 - 2 * a.b so that the whole
    matrix is obtained from a single matrix product.

    Parameters
    ----------
    trainset : array-like of shape (m, d)
    testset : array-like of shape (n, d)

    Returns
    -------
    numpy.ndarray of shape (m, n)
        Entry [i, j] is the distance between trainset[i] and testset[j].
    """
    #Casting to float so integer, boolean or object-dtype inputs work with the arithmetic and np.sqrt
    trainset = np.asarray(trainset, dtype=float)
    testset = np.asarray(testset, dtype=float)

    train_sq = (trainset * trainset).sum(axis=1)[:, np.newaxis]    #Squared row norms, shape (m, 1)
    test_sq = (testset * testset).sum(axis=1)[np.newaxis, :]       #Squared row norms, shape (1, n)

    #Broadcasting builds the (m, n) matrix without materialising helper matrices of ones
    squared = train_sq + test_sq - 2.0 * trainset.dot(testset.T)

    #Rounding can leave tiny negative values for (near-)identical rows; clip them so sqrt never returns NaN
    np.maximum(squared, 0.0, out=squared)

    return np.sqrt(squared)

In [None]:
distance_array = euclidian_distance(train_array, test_array)

In [None]:
distance_array.shape          #Shape should be equal to (number of rows of train, number of rows of test)

In [None]:
def Knn_regression(distance_data, k, targets=None):
    """Predict a value for every test row as the mean target of its k nearest train rows.

    Parameters
    ----------
    distance_data : array-like of shape (n_train, n_test)
        Entry [i, j] is the distance between train row i and test row j.
    k : int
        Number of nearest neighbours to average; must satisfy 1 <= k <= n_train.
    targets : array-like of length n_train, optional
        Target value of each train row, in the same order as the rows of
        ``distance_data``. Defaults to the notebook-level ``train_y``.

    Returns
    -------
    pandas.DataFrame of shape (n_test, 1)
        Column 0 holds the prediction for each test row.

    Raises
    ------
    ValueError
        If ``distance_data`` is not 2-D, ``k`` is out of range, or the number
        of targets does not match the number of train rows.
    """
    distances = np.asarray(distance_data, dtype=float)
    if distances.ndim != 2:
        raise ValueError("distance_data must be a 2-D (n_train, n_test) matrix")
    n_train, n_test = distances.shape
    if not 1 <= k <= n_train:
        raise ValueError("k must be between 1 and the number of train rows")

    if targets is None:
        targets = train_y      #Falling back to the targets defined earlier in the notebook
    target_values = np.asarray(targets, dtype=float).reshape(-1)    #Positional order, index labels are ignored
    if target_values.shape[0] != n_train:
        raise ValueError("targets must contain one value per train row")

    #Sorting each column puts the indices of the closest train rows first; keep the first k of them
    nn_indices = np.argsort(distances, axis=0)[:k]                  #Shape (k, n_test)

    #Looking up the targets of all neighbours at once and averaging over the k neighbours
    predictions = target_values[nn_indices].mean(axis=0)

    return pd.DataFrame(predictions.reshape(n_test, 1))

In [None]:
predicted = Knn_regression(distance_array, 9)
predicted.shape

In [None]:
predicted = pd.DataFrame(predicted)
predicted