### Data Cleaning from Scratch and Visualization of Cleaned Data

 # Data Cleaning Operation
 
 1. Treatment Of Missing Values
 2. Smoothing of noisy Data
 3. Data Transformation

#### **Source of the database**: https://sci2s.ugr.es/keel/dataset/data/classification/pima-10-fold.zip

   From National Institute of Diabetes and Digestive and Kidney Diseases. Several constraints 
were placed on the selection of these instances from a larger database. In particular, all patients 
here are females at least 21 years old of Pima Indian heritage. 

   The class label indicates whether the person does not have diabetes (tested_negative) or 
has diabetes (tested_positive). 

#### Attribute information: 
1. Preg = Number of times pregnant 
2. Plas = Plasma glucose concentration at 2 hours in an oral glucose tolerance test 
3. Pres = Diastolic blood pressure (mm Hg) 
4. Skin = Triceps skin fold thickness (mm) 
5. Insu = 2-Hour serum insulin (mu U/ml) 
6. Mass = Body mass index (weight in kg/(height in m)^2) 
7. Pedi = Diabetes pedigree function 
8. Age = Age (years)


#### Missing Values: 50.65%

#### The program can be divided into 4 parts:
1. Reading the CSV file
2. Find the missing values and replace them (by any one of the following methods).
        a. Fill the missing value manually
        b. Use a Global Constant
        c. Use mean to fill the value
        d. Use Median to fill the value
3. Divide key values into bins and smoothing them. (by any one of the following methods)
        a. Smooth by Bin Means
        b. Smooth by Bin Boundaries
4. Apply the normalization process. (by any of the following methods)
        a. Min-max Normalization
        b. Z-score normalization
        c. Decimal Scaling

In [None]:
## Import Support libraries
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import array
import math

In [None]:
## Load the raw dataset (CSV downloaded from the source) and preview the first rows
df = pd.read_csv(
    filepath_or_buffer="../input/missing-values-pima-indians-diabetes-data/pima_Missing_values.csv"
)
print(df.head(n=5))

## Data preprocessing ----
### converting the categorical column (output column) into numeric values

In [None]:
# Encode the target labels as integers: tested_positive -> 1, tested_negative -> 0.
# The whole column is rebuilt in one step, which
#   * covers every row instead of a hard-coded 690, and
#   * avoids chained assignment (df.Class[i] = ...), which pandas may apply to
#     a temporary copy rather than to df itself.
# Values that are not one of the two labels pass through unchanged, so the
# cell can be re-run on an already encoded column; anything that cannot be
# converted still fails loudly in astype(int).
class_codes = {'tested_positive': 1, 'tested_negative': 0}
df["Class"] = df["Class"].map(lambda label: class_codes.get(label, label)).astype(int)

### replacing "<null>" values with -999 (a numeric sentinel)

In [None]:
# Replace the textual '<null>' markers in the 8 feature columns with the
# numeric sentinel -999 and record, per column, how many cells were replaced.
# The tally is kept in `missing_counts`; the previous name `sum` shadowed the
# built-in sum() for the rest of the notebook. The replacement is done with a
# boolean mask so it covers every row rather than a hard-coded 690.
feature_columns = df.columns[:8]
null_mask = df[feature_columns] == '<null>'
missing_counts = [int(n) for n in null_mask.sum()]
df[feature_columns] = df[feature_columns].mask(null_mask, -999)

### converting object datatype into float datatype 

In [None]:
# Convert the 8 feature columns to float.
# Assigning through df[column] replaces the column, so the float dtype is
# kept. Writing through df.iloc[:, i] = ... sets the values into the existing
# column on pandas >= 2.0, which leaves an object-dtype column as object.
for column in df.columns[:8]:
    df[column] = df[column].astype(float)

## Global Constants ----

In [None]:
# Fallback value for each of the 8 feature columns, in column order.
GlobConst = [3, 110, 105, 18, 20.0, 26.5, 0.5, 30]

print("Global Constants for given database is: ")
for position, constant in enumerate(GlobConst):
    print(df.columns[position], " = ", constant)

## --------------------Filling the missing values-------

### Using column Global Constant

In [None]:
def useGlobalConstant():
    """Fill missing feature values with the per-column global constants.

    Every cell equal to the sentinel -999 in the first 8 columns of the
    module-level ``df`` is overwritten in place with the matching entry of
    ``GlobConst``. Afterwards 'Error' is printed once for each sentinel that
    is still present, and the head of ``df`` is printed.
    """
    for i in range(0, 8):
        column = df.columns[i]
        # Mask-based assignment handles any number of rows (the loop used to
        # be limited to exactly 690) and does one write per column.
        df.loc[df[column] == -999, column] = GlobConst[i]
    # Sanity check: no sentinel should be left in the feature columns.
    leftover = int((df.iloc[:, 0:8] == -999).sum().sum())
    for _ in range(leftover):
        print('Error')
    print(df.head())

### Using column Mean

In [None]:
def useMean():
    """Fill missing feature values with the column mean.

    For each of the first 8 columns of the module-level ``df`` the mean of the
    cells that are not the sentinel -999 is computed, rounded to 3 decimals,
    reported, and written in place into every sentinel cell. The head of
    ``df`` is printed at the end.
    """
    total_rows = len(df)  # replaces the hard-coded 690
    for i in range(0, 8):
        column = df.columns[i]
        is_missing = df[column] == -999
        present = df.loc[~is_missing, column]
        count = len(present)
        if count == 0:
            # Nothing observed in this column: there is no mean to fill with
            # (this case used to end in a division by zero).
            print("col ", i, " has no observed values; left unchanged")
            continue
        avg = round(float(present.astype(float).mean()), 3)
        print("col ", i, " Average = ", avg, " total missing = ", (total_rows - count))
        df.loc[is_missing, column] = avg
    print(df.head())

### Using column Median

In [None]:
def useMedian():
    """Fill missing feature values with the column median.

    For each of the first 8 columns of the module-level ``df`` the median of
    the cells that are not the sentinel -999 is computed, reported, and
    written in place into every sentinel cell. The head of ``df`` is printed
    at the end.
    """
    total_rows = len(df)  # replaces the hard-coded 690
    for i in range(0, 8):
        column = df.columns[i]
        is_missing = df[column] == -999
        present = df.loc[~is_missing, column]
        median = present.median()
        print("col ", i, " Median = ", median, " total missing = ", (total_rows - len(present)))
        # One masked write per column instead of one scalar write per row.
        df.loc[is_missing, column] = median
    print(df.head())

In [None]:
# Ask which strategy should fill the missing values and run it.
print("\n\nFilling the missing values choice: ")
print("\n 1. Use a Global Constant\n 2. Use mean to fill the value\n 3. Use Median to fill the value\n")
c = input("Choice = ")
print(c)
# Menu key -> fill function; any other key is rejected.
fill_strategies = {'1': useGlobalConstant, '2': useMean, '3': useMedian}
chosen_fill = fill_strategies.get(c)
if chosen_fill is None:
    print("Invalid Input.")
else:
    chosen_fill()

## --------------------Smoothing data by binning------------

In [None]:
# Independent copy of the 8 feature columns; the smoothed values are stored here.
df_copy = df.iloc[:, :8].copy()

### Binning by frequency

In [None]:
def binningByFreqency(sr):
    """Plan equal-frequency bins for one column.

    The number of bins is the integer square root of the number of values and
    the bin width is the number of values divided by the number of bins
    (integer division). Both are derived from ``len(sr)`` instead of a
    hard-coded 690, so a 690-row column still yields 26 bins of width 26.

    Args:
        sr: pandas Series holding the values of one column.

    Returns:
        A tuple ``(bins, sort_index, noOfBins, binwidth)``:
        ``bins`` is a zero-filled list of ``noOfBins`` lists of length
        ``binwidth``; ``sort_index`` is a NumPy array with the index labels of
        ``sr`` in ascending value order; ``noOfBins`` and ``binwidth`` are
        ints. ``sr`` itself is not modified.
    """
    total = len(sr)
    noOfBins = int(math.sqrt(total))
    sort_index = np.array(sr.sort_values().index)
    # An empty Series has no bins; avoid dividing by zero.
    binwidth = total // noOfBins if noOfBins else 0
    bins = [[0] * binwidth for _ in range(noOfBins)]
    return (bins, sort_index, noOfBins, binwidth)

## Smoothing:

### Smoothing by bin means

In [None]:
def smoothByBinMeans():
    """Smooth the 8 feature columns by equal-frequency bin means.

    For each of the first 8 columns of the module-level ``df`` the values are
    ranked in ascending order, split into consecutive bins (the bin count
    comes from ``binningByFreqency``), and every value is replaced by the mean
    of its bin. Bin means are rounded to 1 decimal for column 5, to 3 decimals
    for column 6 and truncated to an integer for every other column.

    The smoothed columns are stored in ``df_copy`` in the row order of ``df``,
    and ``df_copy`` is written to 'BinnedDataPimaIndianDiabetes.csv'.
    """
    for i in range(0, 8):
        sr = df.iloc[:, i]
        values = sr.to_numpy(dtype=float)
        # Only the bin count is taken from the helper; guard against 0 bins.
        noOfBins = max(binningByFreqency(sr)[2], 1)
        # Row positions in ascending value order. Bins have to be cut from the
        # sorted values; slicing the unsorted column (as was done before)
        # averages unrelated rows.
        order = np.argsort(values, kind="stable")
        smoothed = np.empty(len(values), dtype=float)
        # array_split spreads the remainder over the bins so every row gets a
        # smoothed value (26 bins of width 26 left 14 of 690 rows at 0).
        for positions in np.array_split(order, noOfBins):
            if positions.size == 0:
                continue
            binMean = values[positions].mean()
            if i == 5:
                binMean = round(binMean, 1)
            elif i == 6:
                binMean = round(binMean, 3)
            else:
                binMean = int(binMean)
            smoothed[positions] = binMean
        df_copy[df_copy.columns[i]] = smoothed
    print("CSV named \"BinnedDataPimaIndianDiabetes.csv\" is generated after binning.")
    df_copy.to_csv('BinnedDataPimaIndianDiabetes.csv')

### Smoothing by bin boundaries

In [None]:
def smoothByBinBoundaries():
    """Smooth the 8 feature columns by equal-frequency bin boundaries.

    For each of the first 8 columns of the module-level ``df`` the values are
    ranked in ascending order and split into consecutive bins (the bin count
    comes from ``binningByFreqency``). Every value is replaced by the closer
    of its bin's lowest and highest value; a value exactly halfway goes to the
    highest value.

    The smoothed columns are stored in ``df_copy`` in the row order of ``df``,
    and ``df_copy`` is written to 'BinnedDataPimaIndianDiabetes.csv'.
    ``df`` itself is not modified.
    """
    for i in range(0, 8):
        sr = df.iloc[:, i]
        values = sr.to_numpy(dtype=float)
        # Only the bin count is taken from the helper; guard against 0 bins.
        noOfBins = max(binningByFreqency(sr)[2], 1)
        # Row positions in ascending value order; bins are cut from the sorted
        # values so that the first/last element of a bin are its boundaries.
        order = np.argsort(values, kind="stable")
        smoothed = np.empty(len(values), dtype=float)
        # array_split covers every row, including the remainder that does not
        # fill a whole bin.
        for positions in np.array_split(order, noOfBins):
            if positions.size == 0:
                continue
            bin_values = values[positions]
            # Positional access: label lookups such as bins[m][0] on a slice
            # raise KeyError for every bin after the first.
            fro = bin_values[0]
            bck = bin_values[-1]
            closer_to_low = (bin_values - fro) < (bck - bin_values)
            smoothed[positions] = np.where(closer_to_low, fro, bck)
        df_copy[df_copy.columns[i]] = smoothed
    print("CSV named \"BinnedDataPimaIndianDiabetes.csv\" is generated after binning.")
    df_copy.to_csv('BinnedDataPimaIndianDiabetes.csv')

In [None]:
# Ask which smoothing method to apply to the binned data and run it.
print("\n\nSmoothing by Binning methods: ")
print("\n1. Smooth by Bin Means\n2. Smooth by Bin Boundaries")
c = input("Choice = ")
print(c)
# Menu key -> smoothing function; any other key is rejected.
smoothing_methods = {'1': smoothByBinMeans, '2': smoothByBinBoundaries}
chosen_smoothing = smoothing_methods.get(c)
if chosen_smoothing is None:
    print("Invalid Input.")
else:
    chosen_smoothing()

## -------------------Normalization --------------------------

In [None]:
# Independent copy of the 8 feature columns; the normalized values are stored here.
df_copy2 = df.iloc[:, :8].copy()

### min max normalization

In [None]:
def MinMaxNorm():
    """Min-max normalize the 8 feature columns into user-supplied ranges.

    For each of the first 8 columns of the module-level ``df`` the user is
    prompted for a new minimum and a new maximum (both rounded to 2
    decimals), and every value v is mapped to
    ``(v - min) / (max - min) * (new_max - new_min) + new_min``.
    A column whose values are all equal is mapped to ``new_min``.

    The results are stored in ``df_copy2`` and written to
    'NormalizedDataPimaIndianDiabetes.csv'.

    Raises:
        ValueError: if a prompt is answered with something ``float`` cannot
            parse.
    """
    for i in range(0, 8):
        sr = df.iloc[:, i].astype(float)
        mina = sr.min()
        maxa = sr.max()
        print("New min value for ", df.columns[i])
        new_mina = round(float(input()), 2)
        print("New max value for ", df.columns[i])
        new_maxa = round(float(input()), 2)
        span = maxa - mina
        if span == 0:
            # Constant column: avoid 0/0 and place every value at the new minimum.
            scaled = np.full(len(sr), new_mina, dtype=float)
        else:
            # One vectorized expression instead of growing a list element by
            # element (list + list copies the whole list on every step).
            scaled = (((sr - mina) / span) * (new_maxa - new_mina) + new_mina).to_numpy()
        df_copy2[df_copy2.columns[i]] = scaled
    print("\nCSV named \"NormalizedDataPimaIndianDiabetes.csv\" is generated with normalized values.")
    df_copy2.to_csv('NormalizedDataPimaIndianDiabetes.csv')

### z-score normalization

In [None]:
def ZScoreNorm():
    """Z-score normalize the 8 feature columns.

    For each of the first 8 columns of the module-level ``df`` every value v
    is mapped to ``(v - mean) / std``, where ``std`` is the value returned by
    ``Series.std()``. A column whose standard deviation is zero or undefined
    is mapped to 0.0 throughout.

    The results are stored in ``df_copy2`` and written to
    'NormalizedDataPimaIndianDiabetes.csv'.
    """
    for i in range(0, 8):
        sr = df.iloc[:, i].astype(float)
        mean_sr = sr.mean()
        std_sr = sr.std()
        if pd.isna(std_sr) or std_sr == 0:
            # No spread to scale by: avoid dividing by zero / NaN.
            normalized = np.zeros(len(sr), dtype=float)
        else:
            # Vectorized instead of growing a list element by element.
            normalized = ((sr - mean_sr) / std_sr).to_numpy()
        df_copy2[df_copy2.columns[i]] = normalized
    print("\nCSV named \"NormalizedDataPimaIndianDiabetes.csv\" is generated with normalized values.")
    df_copy2.to_csv('NormalizedDataPimaIndianDiabetes.csv')

### decimal scaling

In [None]:
def DecimalScal():
    """Normalize the 8 feature columns by decimal scaling.

    For each of the first 8 columns of the module-level ``df`` every value is
    divided by the smallest power of ten ``d`` for which the largest absolute
    value of the column, divided by ``d``, is below 1.

    Compared with the earlier loop this uses the largest *absolute* value
    (so negative values are taken into account) and no longer divides by 10
    when the column already lies strictly inside (-1, 1). Columns whose
    largest absolute value is at least 1 get the same ``d`` as before.

    The results are stored in ``df_copy2`` and written to
    'NormalizedDataPimaIndianDiabetes.csv'.
    """
    for i in range(0, 8):
        sr = df.iloc[:, i].astype(float)
        max_abs = sr.abs().max()
        d = 1
        # NaN (e.g. empty column) or inf would never satisfy the loop exit
        # condition in a meaningful way; leave such columns unscaled.
        if math.isfinite(max_abs):
            while max_abs / d >= 1:
                d = d * 10
        # Vectorized instead of growing a list element by element.
        df_copy2[df_copy2.columns[i]] = (sr / d).to_numpy()
    print("\nCSV named \"NormalizedDataPimaIndianDiabetes.csv\" is generated with normalized values.")
    df_copy2.to_csv('NormalizedDataPimaIndianDiabetes.csv')

In [None]:
# Ask which normalization to apply and run it.
print("\n\nNormalization: ")
print("\n1. Min-Max Normalization\n2. Z-Score Normalization\n3. Decimal Scaling")
c = input("Choice = ")
print(c)
# Menu key -> normalization function; any other key is rejected.
normalizers = {'1': MinMaxNorm, '2': ZScoreNorm, '3': DecimalScal}
chosen_normalizer = normalizers.get(c)
if chosen_normalizer is None:
    print("Invalid Input.")
else:
    chosen_normalizer()

# Visualization of data:

In [None]:
# The normalized CSV is written by DataFrame.to_csv() with the row index as
# its first (unnamed) column. Read that column back as the index so it does
# not appear as a spurious 'Unnamed: 0' feature in the analysis of df2.
df2 = pd.read_csv("./NormalizedDataPimaIndianDiabetes.csv", index_col=0)
print(df2.head())

## 1. Correlation Matrix:

In [None]:
# Annotated heatmap of the pairwise correlations between the columns of df2.
plt.figure(figsize=(10, 12))
sns.set(rc={'figure.figsize': (12, 6)})
correlations = df2.corr()
sns.heatmap(correlations, annot=True)

## 2. Univariate Analysis

#### Target variable -> “Class” (0 = non-diabetic, 1 = diabetic)

In [None]:
# Attach the target (the 9th column of df) to df2 and plot how many rows fall in each class.
df2['Class'] = df.iloc[:, 8]
sns.countplot(data=df2, x="Class")

#### All the columns are numerical, so we plot a kernel density estimate for each of them to view the distribution of the data.

In [None]:
# Preview the first rows of df2.
df2.head(n=5)

In [None]:
# Names of the non-object columns of df, without the target column.
num_columns = df.select_dtypes(exclude='object').columns.tolist()
num_columns.remove('Class')
print(num_columns)

# One filled density curve per column, laid out on an 8 x 4 grid.
plt.figure(figsize=(16,40))
for i,col in enumerate(num_columns,1):
    plt.subplot(8,4,i)
    # `fill=True` is the current spelling of the deprecated `shade=True`
    # keyword of seaborn.kdeplot.
    sns.kdeplot(df[col],fill=True)
plt.show()

#### By using boxplots, we can analyse the outliers and inter-quartile range of data.

In [None]:
# One box plot per numeric column, laid out on an 8 x 4 grid.
plt.figure(figsize=(16, 40))
for i, col in enumerate(num_columns, start=1):
    plt.subplot(8, 4, i)
    column_values = df[col]
    column_values.plot.box()
plt.show()

The skewness and kurtosis values computed below help identify which columns might contain outliers.

Columns with a kurtosis value greater than 3 (here the blood pressure column “Pres” and the diabetes pedigree column “Pedi”) are likely to contain outliers, which can be treated with further processing.

In [None]:
# Skewness and kurtosis of every numeric column, shown as a two-row table.
num_data = df[num_columns]
shape_measures = [num_data.skew(), num_data.kurtosis()]
pd.DataFrame(data=shape_measures, index=['skewness', 'kurtosis'])

## 3. Bivariate Analysis

### With target variable

In [None]:
# Scatter plot of every numeric feature against the target, one row per feature.
plt.figure(figsize=(20,40))

# The grid has exactly one row per feature (a fixed 10-row grid left empty
# panels). The former swarm-plot branch for columns named 'X' or 'Y' was
# dropped: no column documented for this dataset carries either name
# (NOTE(review): confirm against the CSV header).
for i,col in enumerate(num_columns,1):
    plt.subplot(len(num_columns),1,i)
    sns.scatterplot(data=df,x=col,y='Class')
plt.show()

## 4. Multivariate Analysis

In [None]:
# Pairwise relationships between all columns of df, coloured by the target.
selected_features = df.columns
print(selected_features)

sns.pairplot(data=df, vars=selected_features, hue='Class')
plt.show()