## Q2. For this problem, we will be working with the census income dataset from the UCI repository. Using this dataset,
## 1. Train a decision tree classification model using information gain as the splitting criterion and using only single feature decision stumps at all non-leaf nodes and majority votes at leaf nodes, and report its validation set performance using % accuracy (15 points)


In [13]:
column_names=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country', 'income']

df = pd.read_csv(r"adult.data", index_col=False, names = column_names,   header = None)


#defining the columns and reading the file in a dataframe

In [35]:
df.info()

#feature scaling
for col in column_names:
    maxx = df[col].max()
    minn = df[col].min()
    
    
    df[col] = df[col].apply(lambda x: (x-minn)/(maxx-minn))
 
#removing noisy/unnecessary columns
for d in ['fnlwgt', 'capital-gain', 'capital-loss', 'education-num', 'income', 'age']:
    column_names.remove(d)

#df.drop(['fnlwgt', 'capital-gain', 'capital-loss', 'education-num'], axis=1, inplace=True)

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   age             32561 non-null  int64
 1   workclass       32561 non-null  int64
 2   fnlwgt          32561 non-null  int64
 3   education       32561 non-null  int64
 4   education-num   32561 non-null  int64
 5   marital-status  32561 non-null  int64
 6   occupation      32561 non-null  int64
 7   relationship    32561 non-null  int64
 8   race            32561 non-null  int64
 9   sex             32561 non-null  int64
 10  capital-gain    32561 non-null  int64
 11  capital-loss    32561 non-null  int64
 12  hours-per-week  32561 non-null  int64
 13  native-country  32561 non-null  int64
 14  income          32561 non-null  int64
dtypes: int64(15)
memory usage: 3.7 MB


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.013889,0.142857,4.6e-05,0.0,0.0,0.166667,0.076923,0.2,0.0,0.0,0.008475,0.0,0.010753,0.0,0.0
2,0.027778,0.285714,9.2e-05,0.066667,0.066667,0.333333,0.153846,0.0,0.0,0.0,0.008475,0.0,0.0,0.0,0.0
3,0.041667,0.285714,0.000139,0.133333,0.133333,0.166667,0.153846,0.2,0.25,0.0,0.008475,0.0,0.0,0.0,0.0
4,0.055556,0.285714,0.000185,0.0,0.0,0.166667,0.230769,0.4,0.25,1.0,0.008475,0.0,0.0,0.025,0.0


In [14]:
df.head()

#cleaning
df = df.replace(' ?',np.nan)
df.isna().sum()

df.fillna(df.mode().loc[0], inplace=True)
df.isna().sum()

for col in column_names:
  df[col], tmp  = pd.Series(list(df[col])).factorize()

In [36]:

def ent(col):
    counts = np.unique(col,return_counts=True)
    ent = 0.0
    for ix in counts[1]:
        p = ix/col.shape[0]
        ent += (-1.0*p*np.log2(p))
    return ent

def splitd(x_data,fkey,fval):
    x_right = pd.DataFrame([],columns=x_data.columns)
    x_left = pd.DataFrame([],columns=x_data.columns)
    for i in range(x_data.shape[0]):
        val = x_data[fkey].loc[i]
        if val >=fval:
            x_right = x_right.append(x_data.iloc[i])
        else:
            x_left = x_left.append(x_data.iloc[i])
    return x_right,x_left

def ig(x_data,fkey,fval):
    right,left = splitd(x_data,fkey,fval)
    
    l = float(left.shape[0])/x_data.shape[0]
    r = float(right.shape[0])/x_data.shape[0]
    if left.shape[0] == 0 or right.shape[0] == 0:
        return -99999
    i_gain = ent(x_data.income) - (l * ent(left.income) + r*ent(right.income))
    return i_gain

class DecisionTree:
    def __init__(self,depth=0,max_depth=10):
        self.left = None
        self.right = None
        self.fkey = None
        self.fval = None
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
    
    def train(self,x_train):
        features=column_names
        info_gains = []
        for ix in features:
            i_gain = ig(x_train,ix,x_train[ix].mean())
            info_gains.append(i_gain)
        self.fkey = features[np.argmax(info_gains)]
        self.fval = x_train[self.fkey].mean()
        print("Splitting Tree",self.fkey)
        data_right,data_left = splitd(x_train,self.fkey,self.fval)
        data_right = data_right.reset_index(drop=True)
        data_left = data_left.reset_index(drop=True)
        if data_left.shape[0] == 0 or data_right.shape[0] == 0:
            if x_train.income.mean() >= 0.5:
                self.target = "Positive"
            else:
                self.target = "Negative"
            return
        if self.depth >= self.max_depth:
            if x_train.income.mean() >= 0.5:
                self.target = "Positive"
            else:
                self.target = "Negative"
            return
        self.left = DecisionTree(self.depth+1,self.max_depth)
        self.left.train(data_left)
        self.right = DecisionTree(self.depth+1,self.max_depth)
        self.right.train(data_right)
        if x_train.income.mean() >= 0.5:
            self.target = "Positive"
        else:
            self.target = "Negative"
        return
    def predict(self,test):
        if test[self.fkey] > self.fval:
            if self.right is None:
                return self.target
            return self.right.predict(test)
        if test[self.fkey] <= self.fval:
            if self.left is None:
                return self.target
            return self.left.predict(test)

In [128]:
split = int(0.7*df.shape[0])
train_data = df[:split]
test_data = df[split:]
test_data= test_data.reset_index(drop=True)


test_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.138889,0.285714,0.597635,0.000000,0.000000,0.166667,0.076923,0.2,0.00,0.0,0.008475,0.000000,0.172043,0.0,0.0
1,0.000000,0.285714,0.766573,0.066667,0.066667,0.166667,0.538462,0.2,0.25,0.0,0.008475,0.054945,0.000000,0.0,1.0
2,0.291667,0.285714,0.766619,0.333333,0.333333,0.000000,0.384615,0.6,0.00,1.0,0.008475,0.000000,0.096774,0.0,0.0
3,0.208333,0.285714,0.766665,0.066667,0.066667,0.500000,0.230769,0.0,0.00,0.0,0.008475,0.000000,0.000000,0.0,0.0
4,0.388889,0.142857,0.766711,0.066667,0.066667,0.166667,0.307692,0.2,0.00,0.0,0.008475,0.000000,0.086022,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9764,0.513889,0.285714,1.000000,0.400000,0.400000,0.166667,0.769231,0.4,0.00,1.0,0.008475,0.000000,0.150538,0.0,0.0
9765,0.180556,0.285714,0.029057,0.066667,0.066667,0.166667,0.692308,0.2,0.00,0.0,0.008475,0.000000,0.000000,0.0,1.0
9766,0.694444,0.285714,0.901141,0.066667,0.066667,1.000000,0.000000,0.8,0.00,1.0,0.008475,0.000000,0.000000,0.0,0.0
9767,0.333333,0.285714,0.826350,0.066667,0.066667,0.000000,0.000000,0.6,0.00,0.0,0.008475,0.000000,0.096774,0.0,0.0


In [129]:
dt = DecisionTree()


In [130]:
split = int(0.5*test_data.shape[0])
print(split)
valid_data = test_data[:split]
print(valid_data)
test_data = test_data[split:]
print(test_data)

4884
           age  workclass    fnlwgt  education  education-num  marital-status  \
0     0.138889   0.285714  0.597635   0.000000       0.000000        0.166667   
1     0.000000   0.285714  0.766573   0.066667       0.066667        0.166667   
2     0.291667   0.285714  0.766619   0.333333       0.333333        0.000000   
3     0.208333   0.285714  0.766665   0.066667       0.066667        0.500000   
4     0.388889   0.142857  0.766711   0.066667       0.066667        0.166667   
...        ...        ...       ...        ...            ...             ...   
4879  0.138889   0.571429  0.027302   0.000000       0.000000        0.000000   
4880  0.319444   0.285714  0.292096   0.000000       0.000000        0.000000   
4881  0.333333   0.285714  0.329237   0.466667       0.466667        0.000000   
4882  0.083333   0.285714  0.004435   0.333333       0.333333        0.166667   
4883  0.000000   0.285714  0.768513   0.400000       0.400000        1.000000   

      occupation  rela

In [131]:
test_data  = test_data.reset_index(drop=True)
valid_data  = valid_data.reset_index(drop=True)


In [124]:
valid_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.138889,0.285714,0.597635,0.000000,0.000000,0.166667,0.076923,0.2,0.00,0.0,0.008475,0.000000,0.172043,0.0,0.0
1,0.000000,0.285714,0.766573,0.066667,0.066667,0.166667,0.538462,0.2,0.25,0.0,0.008475,0.054945,0.000000,0.0,1.0
2,0.291667,0.285714,0.766619,0.333333,0.333333,0.000000,0.384615,0.6,0.00,1.0,0.008475,0.000000,0.096774,0.0,0.0
3,0.208333,0.285714,0.766665,0.066667,0.066667,0.500000,0.230769,0.0,0.00,0.0,0.008475,0.000000,0.000000,0.0,0.0
4,0.388889,0.142857,0.766711,0.066667,0.066667,0.166667,0.307692,0.2,0.00,0.0,0.008475,0.000000,0.086022,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4879,0.138889,0.571429,0.027302,0.000000,0.000000,0.000000,0.076923,0.6,0.00,0.0,0.008475,0.000000,0.000000,0.0,0.0
4880,0.319444,0.285714,0.292096,0.000000,0.000000,0.000000,0.461538,0.0,0.00,0.0,0.008475,0.000000,0.000000,0.0,0.0
4881,0.333333,0.285714,0.329237,0.466667,0.466667,0.000000,0.000000,0.6,0.00,1.0,0.008475,0.000000,0.000000,0.0,0.0
4882,0.083333,0.285714,0.004435,0.333333,0.333333,0.166667,0.307692,0.2,0.00,0.0,0.008475,0.000000,0.000000,0.0,0.0


In [125]:
test_data

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,0.000000,0.285714,0.888530,0.066667,0.066667,0.166667,0.230769,0.2,0.25,0.0,0.008475,0.0,0.000000,0.0,0.0
1,0.194444,0.285714,0.888576,0.333333,0.333333,0.000000,0.000000,1.0,0.00,1.0,0.008475,0.0,0.000000,0.0,0.0
2,0.097222,0.571429,0.664711,0.333333,0.333333,0.166667,0.076923,0.2,0.00,0.0,0.008475,0.0,0.000000,0.0,1.0
3,0.375000,0.285714,0.888622,0.466667,0.466667,0.000000,0.000000,0.0,0.00,1.0,0.008475,0.0,0.000000,0.0,0.0
4,0.027778,0.285714,0.888668,0.000000,0.000000,0.166667,0.230769,0.2,0.00,0.0,0.008475,0.0,0.032258,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4880,0.513889,0.285714,1.000000,0.400000,0.400000,0.166667,0.769231,0.4,0.00,1.0,0.008475,0.0,0.150538,0.0,0.0
4881,0.180556,0.285714,0.029057,0.066667,0.066667,0.166667,0.692308,0.2,0.00,0.0,0.008475,0.0,0.000000,0.0,1.0
4882,0.694444,0.285714,0.901141,0.066667,0.066667,1.000000,0.000000,0.8,0.00,1.0,0.008475,0.0,0.000000,0.0,0.0
4883,0.333333,0.285714,0.826350,0.066667,0.066667,0.000000,0.000000,0.6,0.00,0.0,0.008475,0.0,0.096774,0.0,0.0


In [132]:
dt.train(train_data[:5000])

Splitting Tree relationship
Splitting Tree relationship
Splitting Tree occupation
Splitting Tree workclass
Splitting Tree hours-per-week
Splitting Tree marital-status
Splitting Tree education
Splitting Tree education
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree occupation
Splitting Tree occupation
Splitting Tree occupation
Splitting Tree sex
Splitting Tree occupation
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree occupation
Splitting Tree occupation
Splitting Tree education
Splitting Tree occupation
Splitting Tree occupation
Splitting Tree race
Splitting Tree native-country
Splitting Tree workclass
Splitting Tree education
Splitting Tree hours-per-week
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree education
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree education
Splitting Tree workclass
Splitting Tree education
Splitting Tree education
Splitting Tree education
Splitting Tree education
Splitting Tree hours-per-w

In [136]:
train_data = train_data[:5000]
dt

<__main__.DecisionTree at 0x7feae129c460>

In [137]:
y_pred = []
for ix in range(valid_data.shape[0]):
    y_pred.append(dt.predict(valid_data.loc[ix]))

y_pred[:10]

for i in range(len(y_pred)):
    if y_pred[i] == "Negative":
        y_pred[i] = 0
    else:
        y_pred[i] = 1

print("Depth:10; Accuracy is", str(round(np.mean(y_pred == valid_data['income'])*100,2)), '%')

Depth:10; Accuracy is 80.0 %


## 2. Use cross-validation to optimize the tree hyperparameters (10 points)
### > __3-fold CV__ : I have already used a maxDepth of 10 with the 9 features above, and obtained an accuracy of 80% on validation data, now I shall be using maxDepths of 8, 9 and see how it works:


In [138]:
mds = [8,9]
for md in mds:
    dt = DecisionTree(max_depth=md)
    dt.train(train_data)
    y_pred = []
    for ix in range(valid_data.shape[0]):
        y_pred.append(dt.predict(valid_data.loc[ix]))

    y_pred[:10]

    for i in range(len(y_pred)):
        if y_pred[i] == "Negative":
            y_pred[i] = 0
        else:
            y_pred[i] = 1

    print("Depth:",str(md), ";", "Accuracy is", str(round(np.mean(y_pred == valid_data['income'])*100,2)), '%')
    
    
    

Splitting Tree relationship
Splitting Tree relationship
Splitting Tree occupation
Splitting Tree workclass
Splitting Tree hours-per-week
Splitting Tree marital-status
Splitting Tree education
Splitting Tree education
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree race
Splitting Tree native-country
Splitting Tree workclass
Splitting Tree education
Splitting Tree hours-per-week
Splitting Tree workclass
Splitting Tree hours-per-week
Splitting Tree race
Splitting Tree sex
Splitting Tree workclass
Splitting Tree marital-status
Splitting Tree education
Splitting Tree workclass
Splitting Tree education
Splitting Tree race
Splitting Tree education
Splitting Tree workclass
Splitting Tree education
Splitting Tree sex
Splitting Tree education
Splitting Tree marital-status
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree workclass
Splitting Tree education
Splitting Tree education
Splitting Tree education
Splitting Tree marital-status
Splitting Tree occupation
Spli

### - Depth: 8 ; Accuracy is 81.02 %
### - Depth: 9 ; Accuracy is 79.98 %
### - Depth:10; Accuracy is 80.0 %

 # Hence, we come to a conclusion that maxDepth of 8 is the most optimal

In [None]:
dt = DecisionTree(max_depth=8)
dt.train(train_data)


In [146]:
y_pred = []
for ix in range(test_data.shape[0]):
    y_pred.append(dt.predict(test_data.loc[ix]))

y_pred[:10]



['Positive',
 'Negative',
 'Positive',
 'Negative',
 'Positive',
 'Negative',
 'Negative',
 'Negative',
 'Negative',
 'Negative']

In [147]:
for i in range(len(y_pred)):
    if y_pred[i] == "Negative":
        y_pred[i] = 0
    else:
        y_pred[i] = 1

print("Depth:",str(8), ";", "Accuracy is", str(round(np.mean(y_pred == test_data['income'])*100,2)), '%')

Depth: 8 ; Accuracy is 80.51 %


## So I am getting an accuracy of 80.51% on test data with a value of maxDepth as 8, as obtained after cross validation