In [61]:
import numpy as np
import pandas as pd
from collections import defaultdict
from scipy.stats import hmean
from scipy.spatial.distance import cdist
from scipy import stats
import numbers

# Exploratory Data Analysis

In [62]:
# df = pd.read_csv("data/Students_Performance_mv.csv")
df = pd.read_csv("data/WhatsgoodlyData-6.csv")
df.head()

Unnamed: 0,Question,Segment Type,Segment Description,Answer,Count,Percentage
0,What social platform has influenced your onlin...,Mobile,Global results,Facebook,548,0.205
1,What social platform has influenced your onlin...,Mobile,Global results,Instagram,916,0.342
2,What social platform has influenced your onlin...,Mobile,Global results,Snapchat,86,0.032
3,What social platform has influenced your onlin...,Mobile,Global results,Twitter,179,0.067
4,What social platform has influenced your onlin...,Mobile,Global results,,947,0.354


In [63]:
# Drop numerical columns
# df.drop(columns=['math score','reading score','writing score'], inplace=True)
df.drop(columns=['Count','Percentage'], inplace=True)

In [64]:
# Find the number of missing values per column.
df.isnull().sum()

Question               0
Segment Type           0
Segment Description    0
Answer                 0
dtype: int64

In [65]:
df.columns

Index(['Question', 'Segment Type', 'Segment Description', 'Answer'], dtype='object')

In [66]:
# Nullify some fields at random
df = df.stack().sample(frac=0.9).unstack()
df.head(10)

Unnamed: 0,Question,Segment Type,Segment Description,Answer
0,What social platform has influenced your onlin...,Mobile,Global results,
1,What social platform has influenced your onlin...,Mobile,Global results,Instagram
2,What social platform has influenced your onlin...,Mobile,Global results,Snapchat
3,What social platform has influenced your onlin...,,Global results,Twitter
4,What social platform has influenced your onlin...,Mobile,Global results,
5,What social platform has influenced your onlin...,Web,Web,Facebook
6,What social platform has influenced your onlin...,Web,,
7,What social platform has influenced your onlin...,Web,Web,Snapchat
8,What social platform has influenced your onlin...,Web,Web,Twitter
9,What social platform has influenced your onlin...,Web,Web,


In [67]:
df.isnull().sum()

Question               149
Segment Type           138
Segment Description    132
Answer                 161
dtype: int64

# Starting the Algorithm

In [68]:
def ExtractCompleteTuples(df):
    # getting the rows without null values
    CT = df.dropna()
    return CT   # CT.shape #(959, 5)

In [69]:
def ExtractInCompleteTuples(df):
    # getting only the rows with null values
    ICT = df[df.isnull().any(axis=1)]
    # print(ICT.shape) #(41, 5)
    return ICT.values

In [70]:
from math import log,e

# Entropy weight method (EWM)
def ComputeAttributeWeights(CT):
    n = CT.shape[0] # the number of rows in complete tuples
    s = CT.shape[1] # the number of columns

    # 1- Normalizing data(just numerocal cols)

    # 2-1 Calculating the entropy of each numerical attribute   
    
    # 2-2 Calculating the entropy of each categorical attribute 
    def entropy(labels, base=None):
        vc = pd.Series(labels).value_counts(normalize=True, sort=False)
        base = e if base is None else base
        return -(vc * np.log(vc)/np.log(base)).sum()

    E = []          # [0.6924027159890356, 1.5185039737243646, 1.71940544072419, 0.6502094546756849, 0.6508318554230292]
    for column in CT:
        E.append(entropy(CT[column], base=None))
    # 3- Determining the weight of each attribute
    w = [0] * s     # [-1.3295556932195063, 2.241176844063399, 3.1095515115596424, -1.5119314608568568, -1.5092412015466796]
    # TODO what is k?
    k = s
    sum = 0
    for i in range(k):
        sum += E[i]

    for i in range(s):
        w[i] = (1 - E[i]) / (k - sum)

    return w

In [71]:
def SortInCompleteTuples(ICT, r):
    # Convert list to npArray
    r = np.array(r)

    # Arg sort
    argSort = np.argsort(r) # it sorts r, and returns corresponding indexes
    
    # Create new empty npArray for sorted ICT
    sortedICT = np.copy(ICT)
    for index in range(argSort.size):
        sortedICT[index] = ICT[argSort[index]]
    
    return sortedICT

In [72]:
def Partition(seq, num):
    avg  = len(seq) / float(num)
    out  = []
    last = 0.0

    while last < len(seq):
        out.append(seq[int(last):int(last + avg)])
        last += avg

    return out

In [73]:
def GenerateTuplePartition(sortedICT, m):
    T = []
    T = Partition(sortedICT, m)
    return T

In [74]:
r = [1] * inCompleteRowsCount
def GenerateTuplePartitions(ICT, CT, m, s):

    W = ComputeAttributeWeights(CT)
    # STEP 1
    # Calculate tuple integrity rate, according to DEFINITION 5(example)
    inCompleteRowsCount = ICT.shape[0]      # the number of ICT rows                     
    for i in range(inCompleteRowsCount):
        for j in range(s):
            if pd.isnull(ICT[i][j]):
                r[i] = r[i] - W[j]    
    # TODO  r (-4.350728355623041, 2.511931460856857) ?      

    # STEP 2
    # sort ICT's tuples according to their integrity rate
    sortedICT = SortInCompleteTuples(ICT, r)
    
    # STEP 3
    tuplePartitions = GenerateTuplePartition(sortedICT, m)
    return tuplePartitions # a queue of subsets

In [75]:
# Defining a function which calculates euclidean distance between two data points(numerical)
def euclideanDistance(data1, data2, length):
    distance = 0
    for x in range(length):
        distance += np.square(data1[x] - data2[x])
    return np.sqrt(distance)

In [76]:
def distance_matrix(complete_set,incomplete_set, numeric_distance = "euclidean", categorical_distance = "hamming"):
    
    # Get the type of each attribute (Numeric or categorical)
    is_numeric = [all(isinstance(n, numbers.Number) for n in complete_set.iloc[:, i]) for i, x in enumerate(complete_set)]
    is_all_categorical = sum(is_numeric) == 0

    if categorical_distance == 'hamming':
        complete_set = pd.DataFrame([pd.factorize(complete_set[x])[0] for x in complete_set]).transpose()
        incomplete_set = pd.DataFrame([pd.factorize(incomplete_set[x])[0] for x in incomplete_set]).transpose()

    if is_all_categorical:
        if categorical_distance == "hamming":
            result_matrix = cdist(complete_set, incomplete_set, metric=categorical_distance)

    return pd.DataFrame(result_matrix)

In [77]:
def knn_impute(complete_set, incomplete_set, k_neighbors, aggregation_method="mode", numeric_distance="euclidean",
               categorical_distance="hamming"):
    
    numberOfICSamples=len(incomplete_set)
    target=[]
    inc_set=incomplete_set.copy()
    
    # Make sure the data are in the right format
    inc_set = pd.DataFrame(inc_set)
    complete_set = pd.DataFrame(complete_set)
    
    # Get the distance matrix and check whether no error was triggered when computing it
    distances = distance_matrix(complete_set,inc_set, numeric_distance, categorical_distance)

    # Get the closest points and compute the correct aggregation method
    for j in range(numberOfICSamples):
        for i, value in enumerate(inc_set.iloc[j, :]):
            if pd.isnull(value):
                order = distances.iloc[:,i].values.argsort()[:k_neighbors]
                closest_to_target = complete_set.iloc[order, i]
                inc_set.iloc[j,i]=stats.mode(closest_to_target)[0][0]
    
#     target=inc_set
    return inc_set

In [78]:
def KNNImputation(train_set, test_set):
    return knn_impute(train_set, test_set, k_neighbors=31)

In [79]:
def Merge(a, b):
    a=np.array(a)
    b=np.array(b)
    return np.concatenate((a, b), axis=0)

In [80]:
def Mean(a, b):
    # print(a)
    # print(b)
    pass

In [81]:
# Begin
CT  = ExtractCompleteTuples(df)   # this is dataframe
ICT = ExtractInCompleteTuples(df) # this is npArray

# The number of partitions
m = 5 # TODO ?
# The number of attributes
s = df.columns.size

T = GenerateTuplePartitions(ICT, CT, m, s)

CTS    = [[0]] * (m+1)
Tp     = [[0]] * (m+1)
CTS[0] = np.array(CT.copy())

for i in range(1, m+1):
    Tp[i-1] = KNNImputation(CTS[i-1], T[i-1])
    CTS[i]  = Merge(CTS[i-1], Tp[i-1])


# Do cross validation
Tpp = [[0]] * (m+1)
for i in range(0, m):
    Tpp[i] = KNNImputation(train_set=CTS[m], test_set=T[i])

Tpp[m] = KNNImputation(train_set=CTS[0], test_set=T[m-1])    

for i in range(1, m+1):
    CTS[i] = Merge(CTS[i-1],Tpp[i-1])

# print(CTS[m].shape) #(1450, 4)

# D' = CT[m] is our complete dataset.

# Calculate Evaluation metrics 

In [82]:
# convert CTS[m] to a dataFrame
df = pd.DataFrame(CTS[m], columns=['gender', 'race/ethnicity', 'parental level of education', 'lunch', 'test preparation course'])
# df = df.dropna()
df.info()

ValueError: Shape of passed values is (1450, 4), indices imply (1450, 5)

In [83]:
# Find the number of missing values per column.
df.isnull().sum()

Question               149
Segment Type           138
Segment Description    132
Answer                 161
dtype: int64

In [84]:
df['gender']                      = df['gender'].map({'male':0, 'female':1})
df['race/ethnicity']              = df['race/ethnicity'].map({'group A':0, 'group B':1, 'group C':2, 'group D':3, 'group E':4})
df['parental level of education'] = df['parental level of education'].map({"bachelor's degree":0, "high school":1, "some college":2, "master's degree":3, "associate's degree":4, "some high school":5})
df['lunch']                       = df['lunch'].map({'standard':0, 'free/reduced':1})
df['test preparation course']     = df['test preparation course'].map({'none':0, 'completed':1})

df.head()

KeyError: 'gender'

In [None]:
target = np.array(df['gender'])
data   = np.array(df.loc[:, df.columns != 'gender'])

In [85]:
from sklearn.model_selection import train_test_split

xTrain, xTest, yTrain, yTest = train_test_split(data, target, random_state=42)


print("shape of train data:   ", xTrain.shape)
print("shape of test  data:   ", xTest.shape)
print("shape of train target: ", yTest.shape)
print("shape of test  target: ", yTrain.shape)

NameError: name 'data' is not defined

In [86]:
import seaborn as sns
sns.countplot(df['gender'])

ModuleNotFoundError: No module named 'seaborn'

In [87]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(xTrain, yTrain)

yPred = knn.predict(xTest)
print(knn.score(xTrain, yTrain))
print(knn.score(xTest,  yTest))

NameError: name 'xTrain' is not defined

In [88]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(classification_report(yTest, yPred))

NameError: name 'yTest' is not defined

In [89]:
cm = confusion_matrix(yTest, yPred)
sns.heatmap(cm, square=True, annot=True)

NameError: name 'yTest' is not defined

## PFC metric

In [90]:
def PFC(NTrue, NFalse):
    return (NFalse/(NTrue+NFalse))    

In [91]:
def ExtractOriginalSet(df_with_NaN,df_without_NaN):
     return np.array(df_without_NaN[df_with_NaN.isnull().any(axis=1)])

In [92]:
def ExtractPredictSet():
    return CTS[m][len(CT):]

In [93]:
# Calculate the number of correct and incorrect predictions
def Calculate_NT_and_NF_of_prediction(originalSet,predictSet,NaNSet):
    NTrue=0
    NFalse=0
    rowsCount=len(NaNSet)
    columnsCount=len(NaNSet[0])
    
    for i in range(rowsCount):
        for j in range(columnsCount):
            if pd.isnull(NaNSet[i][j]):
                if(originalSet[i][j]==predictSet[i][j]):
                    NTrue+=1
                else:
                    NFalse+=1
    return {
        "NTrue":NTrue,
        "NFalse":NFalse
    }

In [94]:
def SortOriginalSet(originalSet):
    W = ComputeAttributeWeights(CT)

    inCompleteRowsCount = ICT.shape[0]     
    r = [1] * inCompleteRowsCount                      
    for i in range(inCompleteRowsCount):
        for j in range(s):
            if pd.isnull(ICT[i][j]):
                r[i] = r[i] - W[j]    
    
    sortedOriginalSet = SortInCompleteTuples(originalSet, r)   
    
    return sortedOriginalSet

In [95]:
def SortNaNSet():
    sortedNanSet=[0]* (len(ICT))
    index=0
    for i in range(len(T)):
        for j in range(len(T[i])):
            sortedNanSet[index]=T[i][j]
            index+=1
    return sortedNanSet

In [96]:
df2 = pd.read_csv("data/WhatsgoodlyData-6.csv")
df2.drop(columns=['Count','Percentage'], inplace=True)

originalSet=ExtractOriginalSet(df,df2)
originalSet=SortOriginalSet(originalSet)

predictSet=ExtractPredictSet()

NaNSet=SortNaNSet()

result=Calculate_NT_and_NF_of_prediction(originalSet,predictSet,NaNSet)
NTrue=result.get("NTrue")
NFalse=result.get("NFalse")

print(PFC(NTrue,NFalse))

0.5706896551724138
