In [56]:
import pandas as pd
import numpy as np
from scipy import stats

# To read the csv file to get our data, with no column headings
df = pd.read_csv('Indian Liver Patient Dataset (ILPD).csv',header = None)

# To add column headings to the dataframe
col = ['Age','Gender','TB','DB','Alkphos','Sgpt','Sgot','TP','ALB','A/G','Selector']
df.columns = col

# Changes values of gender column to integers 0 and 1
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1})

# Changes selector values to 0 and 1
df['Selector'] = df['Selector'].map({1: 0, 2: 1})

# Seperating both classes
df_0 = df.loc[df['Selector'] == 0]
df_1 = df.loc[df['Selector'] == 1]

#Filling null values with mean in class with selector 0
for i in df_0.columns:
    if df_0[i].isnull().values.any():
        mean = df_0[i].mean()
        df_0[i] = df_0[i].fillna(mean)
        
#Filling null values with mean in class with selector 1        
for i in df_1.columns:
    if df_1[i].isnull().values.any():
        mean = df_1[i].mean()
        df_1[i] = df_1[i].fillna(mean)
        
# Changing dataframe to dataframe with no null values        
df = pd.concat([df_0,df_1], ignore_index=True)

# Using z value to remove outliers
z = np.abs(stats.zscore(df))
df = df[(z < 2).all(axis=1)]


df_0 = df.loc[df['Selector'] == 0]
df_1 = df.loc[df['Selector'] == 1]

# Increasing class size of selector 1 to create equally balanced dataset
count = len(df_0)
while(len(df_1) <= count - 20):
    df2 = df_1.sample(n=60)
    df_1 = pd.concat([df_1, df2], ignore_index=True)

df = pd.concat([df_0,df_1], ignore_index=True)

# Increasing size of dataset by 5 times
count = len(df)
while(len(df) <= count * 5):
    df2 = df.sample(100)
    df = pd.concat([df,df2], ignore_index=True)
df_0 = df.loc[df['Selector'] == 0]
df_1 = df.loc[df['Selector'] == 1]

print(len(df_0))
print(len(df_1))

df = df.sample(frac=1)

print(df)

df.to_csv('ILPD.csv')

1624
1607
      Age  Gender   TB   DB  Alkphos  Sgpt  Sgot   TP  ALB   A/G  Selector
2149   60       1  3.2  1.8      750    79   145  7.8  3.2  0.69         0
2587   65       1  4.9  2.7      190    33    71  7.1  2.9  0.70         0
1800   74       0  0.9  0.3      234    16    19  7.9  4.0  1.00         0
1719   50       1  0.7  0.2      206    18    17  8.4  4.2  1.00         1
2847   60       1  0.6  0.1      186    20    21  6.2  3.3  1.10         1
...   ...     ...  ...  ...      ...   ...   ...  ...  ...   ...       ...
1009   45       1  2.4  1.1      168    33    50  5.1  2.6  1.00         0
1248   22       1  2.7  1.0      160    82   127  5.5  3.1  1.20         1
2419   21       1  0.7  0.2      211    14    23  7.3  4.1  1.20         1
649    60       1  2.9  1.3      230    32    44  5.6  2.0  0.50         0
726    22       1  0.9  0.3      179    18    21  6.7  3.7  1.20         1

[3231 rows x 11 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
