Import Libraries

In [1]:
import pandas as pd
import numpy as np

Loading Dateset

In [2]:
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
class DecisionTree:
    class Question:
        def __init__(self, column,value):
            self.column = column
            self.value = value
            
    class Node:
        def __init__(self,question,trueNode,falseNode,leafNode,prediction):
            self.question = question
            self.leafNode = leafNode
            self.trueNode = trueNode
            self.falseNode = falseNode
            self.prediction = prediction

    def fit(self,x_train,y_train):
        data = x_train
        data["label"] = y_train
        gain, question = self.find_feature(data)
        leafNode=False
        predictions=None
        trueNode=None
        falseNode=None
        if gain==0:
            leafNode = True
            predictions = self.classCount(data)
        else:
            trueBranch,falseBranch = self.branchTree(question,data)
            trueNode = self.train(trueBranch)
            falseNode = self.train(falseBranch)
        self.rootNode = self.Node(question,trueNode,falseNode,leafNode,predictions)
        
    def train(self,data):
        gain, question=self.find_feature(data)
        leafNode=False
        predictions=None
        trueNode=None
        falseNode=None
        if gain==0:
            leafNode = True
            predictions = self.classCount(data)
        else:
            trueBranch,falseBranch=self.branchTree(question,data)
            trueNode = self.train(trueBranch)
            falseNode = self.train(falseBranch)
        return self.Node(question,trueNode,falseNode,leafNode,predictions)
    
    def classCount(self,data):
        probability = data.groupby("label")["label"].count().to_dict()
        for key in probability.keys():
            probability[key] = (probability[key]/len(data))
        return probability
    
    def gini(self,data):
        counts = self.classCount(data)
        impurity = 1
        for lbl in counts:
            prob_of_lbl = counts[lbl] / float(len(data))
            impurity -= prob_of_lbl**2
        return impurity
    
    def info_gain(self,left, right, Impurity):
        p = float(len(left)) / (len(left) + len(right))
        q = float(len(right)) / (len(left) + len(right))
        return Impurity - p * self.gini(left) - q * self.gini(right)
    
    def find_feature(self,data):
        gain = 0
        question = None
        current_uncertainty = self.gini(data)
        for col in data.drop("label",axis=1):
            values = data[col].unique()
            for val in values:
                q = self.Question(col,val)
                truenode,falsenode = self.branchTree(q,data)
                if len(truenode)==0 or len(falsenode)==0:
                    continue
                g = self.info_gain(truenode, falsenode, current_uncertainty)
                if g >= gain:
                    gain, question = g, q
        return gain,question
    
    def branchTree(self,question,data):
        truenode = data[data[question.column]==question.value]
        falsenode = data[data[question.column]!=question.value]
        return truenode,falsenode
        
    def predict(self,data,probability=False):
        if isinstance(data,pd.Series):
            data=data.to_frame().T
        result=[]
        for row in data.iterrows():
            row=row[1]
            node=self.rootNode
            while not node.leafNode:
                if row[node.question.column]==node.question.value:
                    node=node.trueNode
                else:
                    node=node.falseNode
            if probability:
                result.append(node.prediction)
            else:
                result.append(max(node.prediction, key=node.prediction.get))
        return result

Encoder

In [4]:
df = pd.read_csv("depression.csv")
df.head()

Unnamed: 0,Whichyear,feelingrightnow,ExpressFeeling,Gender,Age,Yourlocation,Relationshipstatus,Areyouhappyinancialy,succeededInEducationalinstitution,Understandingwithfamily,feelingpressureinyourstudy,satisfiedwithacademicresult,happywithlivingplace,supportsyouyouracademiclife,usedanysocialmedia,haveinferioritycomplex,satisfiedwithmeal,feelingSick/healt issues,donerecreationalactivitytoday,sleepAtNight
0,4th year,Very good,100,Male,22,Home,Single,Yes,5,Good,Yes,Yes,Yes,Family,Not applicable,No,Yes,Yes,Yes,8
1,4th year,Normal,51,Male,23,Home,Single,No,3,Normal,Yes,Yes,Yes,Family,Yes,Maybe,Neutral,No,Yes,8
2,4th year,Good,87,Male,23,Playground,Single,Yes,4,Normal,No,Yes,Yes,Family,Yes,No,Yes,No,Yes,7
3,4th year,Bad,37,Male,23,Home,Single,Yes,1,Normal,Yes,Yes,No,Family,Yes,Maybe,Neutral,Yes,Yes,5
4,4th year,Normal,56,Male,23,Hall-Mess,Single,No,4,Normal,Yes,No,No,Family,Yes,No,Neutral,No,No,7


In [5]:
Class_Status = df["feelingrightnow"]
df.drop(["feelingrightnow","ExpressFeeling","Whichyear"],axis=1,inplace=True)

In [6]:
df.head()

Unnamed: 0,Gender,Age,Yourlocation,Relationshipstatus,Areyouhappyinancialy,succeededInEducationalinstitution,Understandingwithfamily,feelingpressureinyourstudy,satisfiedwithacademicresult,happywithlivingplace,supportsyouyouracademiclife,usedanysocialmedia,haveinferioritycomplex,satisfiedwithmeal,feelingSick/healt issues,donerecreationalactivitytoday,sleepAtNight
0,Male,22,Home,Single,Yes,5,Good,Yes,Yes,Yes,Family,Not applicable,No,Yes,Yes,Yes,8
1,Male,23,Home,Single,No,3,Normal,Yes,Yes,Yes,Family,Yes,Maybe,Neutral,No,Yes,8
2,Male,23,Playground,Single,Yes,4,Normal,No,Yes,Yes,Family,Yes,No,Yes,No,Yes,7
3,Male,23,Home,Single,Yes,1,Normal,Yes,Yes,No,Family,Yes,Maybe,Neutral,Yes,Yes,5
4,Male,23,Hall-Mess,Single,No,4,Normal,Yes,No,No,Family,Yes,No,Neutral,No,No,7


In [7]:
x_train, x_test, y_train, y_test = train_test_split(df, Class_Status, test_size= 0.4, random_state=1)

In [8]:
model = DecisionTree()
model.fit(x_train,y_train)
 
#y_pred = model.predict(x_train)
y_pred = model.predict(x_test)
 
 
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(metrics.classification_report(y_pred,y_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["label"] = y_train


              precision    recall  f1-score   support

         Bad       0.23      0.27      0.25        11
        Good       0.47      0.41      0.44        22
      Normal       0.85      0.94      0.89        18
    Very bad       0.53      0.43      0.47        21
   Very good       0.46      0.52      0.49        21

    accuracy                           0.53        93
   macro avg       0.51      0.52      0.51        93
weighted avg       0.53      0.53      0.52        93

