In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing 
import pickle

In [None]:
# Load the Adult income dataset from the local `data/` directory.
# NOTE(review): no `header`/`names` argument is passed, so pandas takes the
# column names from the first line of the file — confirm the file has a header row.
df = pd.read_csv('data/adult.data')     
# Preview the first rows as a quick sanity check of the load.
df.head() 

In [None]:
# Remove the two columns that are not used as model features.
df = df.drop(columns=['fnlwgt', 'educational-num'])
   

## Fill NaN

In [None]:
# Treat the "?" placeholder as missing, then impute every column with its most
# frequent value.
col_names = df.columns

# One whole-frame replace is enough: the previous per-column loop never used
# its loop variable and simply repeated this same call once per column.
# `np.nan` is the supported spelling (`np.NaN` was removed in NumPy 2.0).
df = df.replace("?", np.nan)


def _fill_with_most_frequent(column):
    """Return `column` with NaNs replaced by its most frequent non-null value.

    A column that contains no non-null values is returned unchanged, because
    there is no value to impute with.
    """
    counts = column.value_counts()
    if counts.empty:
        return column
    return column.fillna(counts.index[0])


df = df.apply(_fill_with_most_frequent)

df.head()

Discretization – a common way to make categorical data tidier and more meaningful.  
We apply discretization to the `marital-status` column, narrowing its values down to three groups: divorced, married, or not married.   
Later, we will apply a label encoder to the remaining categorical columns. Also, the two columns `education` and `educational-num` are redundant, so we have removed one of them (`educational-num`).

## Replace 

In [None]:
# Collapse the detailed marital-status labels into three coarse groups.
# Patterns and replacements are paired by position; with regex=True each
# pattern is matched as a regular expression across the whole frame.
df.replace(
    to_replace=[
        'Divorced',
        'Married-AF-spouse',
        'Married-civ-spouse',
        'Married-spouse-absent',
        'Never-married',
        'Separated',
        'Widowed',
    ],
    value=[
        'divorced',
        'married',
        'married',
        'married',
        'not married',
        'not married',
        'not married',
    ],
    regex=True,
    inplace=True,
)

df.head(10)

In [None]:
# Toy example of list-to-list replacement: each value in `to_replace` is
# swapped for the value at the same position in `value`.
df3 = pd.DataFrame(list(range(5)))

df3.replace(to_replace=[0, 1, 2, 3], value=[4, 4, 4, 10], inplace=True)

df3

## LabelEncoder()

In [None]:
# Small stand-alone demo: fit a LabelEncoder on a list of city names
# (duplicates included) so the following cells can inspect what it learned.
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

In [None]:
# Show the labels the encoder learned from the fit above.
list(le.classes_)

In [None]:
# Encode labels: each label is mapped to its index in `le.classes_`.
le.transform(["tokyo", "amsterdam", "paris","paris"])

In [None]:
# Decode: map integer codes back to the class labels they stand for.
list(le.inverse_transform([2, 2, 1]))

In [None]:
# Fit and encode in a single step. This re-fits the same `le` object on a new
# list of labels, so `le` no longer reflects the city names fitted earlier.
le.fit_transform(["Thrissur", "Kozhikode", "Chennai", "Kochi"])

In [None]:
# Show the labels held by the encoder after the fit_transform call.
list(le.classes_)

## Rename Column

In [None]:
# Column names listed as categorical for this dataset.
category_col =['workclass', 'race', 'education', 'marital-status', 'occupation', 
               'relationship', 'gender', 'native-country', 'income']  

In [None]:
# Give descriptive names to columns whose current labels are raw data values
# (note the leading space in every key).
df.rename(
    mapper={
        ' State-gov': 'workclass',
        ' Bachelors': 'education',
        ' Never-married': 'marital-status',
        ' Adm-clerical': 'occupation',
        ' Not-in-family': 'relationship',
        ' White': 'race',
        ' Male': 'gender',
        ' United-States': 'native-country',
        ' <=50K': 'income',
    },
    axis='columns',
    inplace=True,
)

In [None]:
# NOTE(review): `category_col` is a list of column-name strings, so this call
# encodes the names themselves rather than any data in `df`; the result is
# only displayed and not stored.
labelEncoder = preprocessing.LabelEncoder()
labelEncoder.fit_transform(category_col) 

In [None]:
# Peek at the first five values of the `race` column.
df['race'].head(5)

In [None]:
# Names of all columns whose dtype is `object`; these are the ones that get
# label-encoded next.
cv = df.columns[(df.dtypes == 'object').to_numpy()]
print(cv)

In [None]:
# Integer-encode every column listed in `cv`, overwriting it in `df`, and
# record for each column which original label maps to which integer code.
labelEncoder = preprocessing.LabelEncoder()
mapping_dict = dict()
for col in cv:
    # The single encoder is re-fitted on each column in turn.
    df[col] = labelEncoder.fit_transform(df[col])

    # Snapshot the label -> code table before the next column re-fits the encoder.
    le_name_mapping = {
        label: code
        for label, code in zip(labelEncoder.classes_,
                               labelEncoder.transform(labelEncoder.classes_))
    }

    mapping_dict[col] = le_name_mapping
print(mapping_dict)

In [None]:

from sklearn.model_selection import train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 
  
# Features are the first 12 columns; the target is the column at index 12.
X = df.to_numpy()[:, :12]
Y = df.to_numpy()[:, 12]


In [None]:
# Show the first three feature rows.
X[:3]

In [None]:

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=100)

# Gini-criterion decision tree, limited in depth and minimum leaf size.
dt_clf_gini = DecisionTreeClassifier(criterion="gini",
                                     random_state=100,
                                     max_depth=6,
                                     min_samples_leaf=5)

# Fit on the training split and predict the held-out split.
dt_clf_gini.fit(X_train, y_train)
y_pred_gini = dt_clf_gini.predict(X_test)

# Report accuracy as a percentage (message spelling corrected: "Decision").
print("Decision Tree using Gini Index\nAccuracy is ",
      accuracy_score(y_test, y_pred_gini) * 100)


## Flask Model 

In [None]:
# Persist the trained classifier for the Flask app. The `with` block closes
# the file, so the pickle is fully flushed to disk before anything reads it back.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(dt_clf_gini, model_file)

In [None]:
# Reload the saved classifier to confirm the pickle round-trips.
# NOTE: unpickling can execute arbitrary code; only load a model.pkl that was
# produced by this notebook.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

## script.py

In [None]:
import numpy as np
from flask import Flask, request, jsonify, render_template
import pickle

# Create the Flask application object that the route decorators register on.
app = Flask(__name__)

# Load the trained classifier once at import time. The `with` block closes the
# file handle instead of leaving it open.
# NOTE: unpickling can execute arbitrary code; only ship a model.pkl you built.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/')
def home():
    """Serve the landing page by rendering the `index.html` template."""
    return render_template('index.html')

# prediction function

def ValuePredictor(to_predict_list):
    """Predict the class for a single sample.

    Parameters
    ----------
    to_predict_list : sequence
        Exactly 12 feature values; they are reshaped into one row of 12 columns.

    Returns
    -------
    The first (and only) element of the model's prediction array.

    Raises
    ------
    ValueError
        If `to_predict_list` cannot be reshaped to (1, 12).
    """
    to_predict = np.array(to_predict_list).reshape(1, 12)
    # Use the classifier loaded once at module import rather than re-opening
    # and unpickling 'model.pkl' on every call, which also leaked the handle.
    result = model.predict(to_predict)
    return result[0]

@app.route('/result', methods=['POST'])
def result():
    """Handle the submitted form and render the prediction page.

    Form values are read in submission order, converted to integers and passed
    to `ValuePredictor`. The outcome is rendered with `result.html`.

    Returns
    -------
    The rendered `result.html` page. When the submitted values are not valid
    integers, or do not form a valid feature row, the page carries an error
    message and the response status is 400.
    """
    # The route is registered for POST only, so no extra method check is needed.
    form_values = list(request.form.to_dict().values())
    try:
        to_predict_list = [int(value) for value in form_values]
        predicted = ValuePredictor(to_predict_list)
    except ValueError:
        # Bad or missing form input is reported to the user rather than
        # surfacing as an unhandled server error.
        return render_template(
            "result.html",
            prediction='Invalid input: please submit integer values for all fields',
        ), 400

    if int(predicted) == 1:
        prediction = 'Income more than 50K'
    else:
        prediction = 'Income less than 50K'
    return render_template("result.html", prediction=prediction)


In [None]:
# Create the project folder and an isolated virtual environment inside it.
mkdir income-prediction
cd income-prediction
python3 -m venv venv
source venv/bin/activate
# The PyPI distribution is named `scikit-learn`; `sklearn` is a deprecated
# alias whose installation fails.
pip install flask numpy scikit-learn pandas
# Flask looks for HTML templates in a `templates/` directory.
mkdir templates

In [None]:
# Point the Flask CLI at the application module, then start the server.
export FLASK_APP=script.py
flask run

In [None]:
├── ./model.pkl
├── ./__pycache__
│   └──
├── ./script.py
├── ./templates
│   ├── ./templates/index.html
│   └── ./templates/result.html
└── ./venv
    ├── ./venv/bin
    ├── ./venv/include
    ├── ./venv/lib
    ├── ./venv/lib64 -> lib
    └── ./venv/pyvenv.cfg