In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

In [43]:
pwd

'c:\\Users\\rahul\\Desktop\\Group Project\\Repo\\MyUniLifeSimulation\\SRC\\Recommender System src\\Recommender System Backup'

In [44]:
# Load the raw recommender dataset; the file is decoded as Windows-1252 (cp1252).
df = pd.read_csv('recommender_system_data.csv',encoding='cp1252')
# Preview the first rows.
df.head()

Unnamed: 0,University,Course_Name,CAO Score,Budget,City,Interest,Job domain
0,University College Dublin,Biomedical Engineering (DN150),556,6000,Dublin,Cricket,IT
1,University College Dublin,Chem & Bioprocess Engineering (NQS2),528,5000,Dublin,Football,HR
2,University College Dublin,Civil Engineering (DN150),551,4500,Dublin,Singing,Management
3,University College Dublin,Computer Science (DN201),542,3500,Dublin,Chess,Support
4,University College Dublin,Electrical/Electronics Engineering (DN150),555,6500,Dublin,Athletics,Finance


In [45]:
# 'University' is dropped and never used as a feature, so it is removed
# directly instead of being whitespace-stripped first.
df = df.drop(['University'], axis=1)
# Course_Name is the prediction target; trim stray surrounding whitespace.
df['Course_Name'] = df['Course_Name'].str.strip()
# Give the columns identifier-friendly names in one pass (no inplace mutation).
df = df.rename(columns={'CAO Score': 'CAO_Score', 'Job domain': 'Job_domain'})

In [46]:
def CAO_SCORE(score):
    """Bucket a numeric CAO score into "Low", "Medium" or "High".

    Bands: [50, 250) -> "Low", [250, 500) -> "Medium", 500 and above -> "High".
    A score that matches no band (below 50, or NaN, which fails every
    comparison) falls through and yields ``None``.
    """
    # Checked from the highest floor down, so the first floor reached wins.
    bands = ((500, "High"), (250, "Medium"), (50, "Low"))
    for floor, label in bands:
        if score >= floor:
            return label
    return None

# Replace each numeric CAO score with its Low/Medium/High band.
# NOTE(review): the column is overwritten, so re-running this cell would pass
# the string labels back into CAO_SCORE, where comparing a str with an int
# raises TypeError.
df['CAO_Score'] = df['CAO_Score'].apply(CAO_SCORE)

In [47]:
def Budget(Budget):
    """Bucket a numeric budget into "Low", "Medium" or "High".

    Bands: [2000, 5000) -> "Low", [5000, 7000) -> "Medium", 7000 and above ->
    "High". A value that matches no band (below 2000, or NaN) yields ``None``.
    """
    if Budget >= 7000:
        return "High"
    if Budget >= 5000:
        return "Medium"
    if Budget >= 2000:
        return "Low"
    return None

# Replace each numeric budget with its Low/Medium/High band.
# NOTE(review): the column is overwritten, so re-running this cell would pass
# the string labels back into Budget(), where comparing a str with an int
# raises TypeError.
df['Budget'] = df['Budget'].apply(Budget)

In [48]:
# Spot-check the last rows after CAO_Score and Budget were converted to bands.
df.tail()

Unnamed: 0,Course_Name,CAO_Score,Budget,City,Interest,Job_domain
4803,Education Mathematics & Business Studies - Thu...,Medium,Low,Tipperary,Cricket,Support
4804,Education Mathematics & Gaeilge - Thurles Camp...,Medium,Low,Tipperary,Football,Law
4805,Strength & Conditioning (SC701),Medium,Medium,Dublin,Singing,IT
4806,Strength & Conditioning (SC801),Medium,Medium,Dublin,Chess,Support
4807,Strength & Conditioning (SC601),Medium,Low,Dublin,Cricket,IT


In [49]:
#df.to_csv('Test.csv')

# Data Preprocessing

In [50]:
# List the columns that remain after dropping 'University' and renaming.
print("All the columns in the dataset: " , df.columns)

All the columns in the dataset:  Index(['Course_Name', 'CAO_Score', 'Budget', 'City', 'Interest', 'Job_domain'], dtype='object')


In [51]:
# Report the number of rows (df.shape[0]) and columns (df.shape[1]).
print('Shape of data is: %s entries and %s column'%(df.shape[0],df.shape[1]))

Shape of data is: 4808 entries and 6 column


In [52]:
# Split the columns by dtype: numeric ones first, then object (string) ones.
print("Numeric Columns in DF: \n" , df.select_dtypes(include=np.number).columns.tolist())
print("\n\nCategorical Columns in DF: \n" , df.select_dtypes(include=['object']).columns.tolist())

Numeric Columns in DF: 
 []


Categorical Columns in DF: 
 ['Course_Name', 'CAO_Score', 'Budget', 'City', 'Interest', 'Job_domain']


In [53]:
# Count missing values per column. This also catches rows for which the
# banding functions returned None (values below the lowest band).
df.isnull().sum(axis=0)

Course_Name    0
CAO_Score      0
Budget         0
City           0
Interest       0
Job_domain     0
dtype: int64

In [54]:
# Print the frequency of every distinct value, one column at a time.
cat_col = df[['Course_Name', 'CAO_Score', 'Budget', 'City', 'Interest', 'Job_domain']]
# Iterating a DataFrame yields its column names, so `i` is a column label.
for i in cat_col:
    print(df[i].value_counts(), end="\n\n")

Bachelor of Engineering Agricultural Systems Engineering (SE733)                                                                        24
Bachelor of Engineering (Honours) Agricultural Systems Engineering (SE732)                                                              24
Bachelor of Business (allowing later specialisation in Business or Business with a Language or Business with Information Technology)    24
Adult Green Cert - Part-time (5M20454 6S20487)                                                                                          13
Education in Teaching & Learning (WD603)                                                                                                12
                                                                                                                                        ..
Bachelor of Science (Honours) Architectural Technology                                                                                   1
Bachelor of Engineering (Ho

In [55]:
# print(df["University"].value_counts())

## Feature Selection & Engineering

Ordinal and Label Encoding for Categorical Values

In [56]:
# Ordinal encoding: map the Low/Medium/High bands of CAO_Score and Budget
# to 0/1/2, one column per pass.
mycol = df[['CAO_Score', 'Budget']]
for i in mycol:
    cleanup_nums = {i: {"Low": 0, "Medium": 1, "High": 2}}
    df = df.replace(cleanup_nums)

# Integer-code the remaining categorical features: each column is converted to
# the pandas 'category' dtype and gets a companion "<name>_code" column holding
# its category code.
# NOTE(review): anything that later builds feature rows by hand (e.g. the Flask
# handler) must use exactly these codes — check them with
# df[col].cat.categories instead of guessing.
cat_col = df[['City', 'Interest', 'Job_domain']]
for i in cat_col:
    df[i] = df[i].astype('category')
    df[i + "_code"] = df[i].cat.codes

# List the columns that are still plain object (string) dtype.
print("\n\nList of Categorical values: \n" , df.select_dtypes(include=['object']).columns.tolist())



List of Categorical values: 
 ['Course_Name']


In [57]:
# Preview the encoded frame: original labels next to their "<name>_code" values.
df.head(5)

Unnamed: 0,Course_Name,CAO_Score,Budget,City,Interest,Job_domain,City_code,Interest_code,Job_domain_code
0,Biomedical Engineering (DN150),2,1,Dublin,Cricket,IT,4,3,2
1,Chem & Bioprocess Engineering (NQS2),2,1,Dublin,Football,HR,4,4,1
2,Civil Engineering (DN150),2,0,Dublin,Singing,Management,4,5,4
3,Computer Science (DN201),2,0,Dublin,Chess,Support,4,2,5
4,Electrical/Electronics Engineering (DN150),2,1,Dublin,Athletics,Finance,4,0,0


In [58]:
# Re-check the dtypes after encoding: object columns, then numeric columns.
print("\n\nList of Categorical values: \n" , df.select_dtypes(include=['object']).columns.tolist())
print("List of Numerical values: \n" , df.select_dtypes(include=np.number).columns.tolist())



List of Categorical values: 
 ['Course_Name']
List of Numerical values: 
 ['CAO_Score', 'Budget', 'City_code', 'Interest_code', 'Job_domain_code']


In [59]:
# Keep the five encoded feature columns plus the target column.
# The column order here is the feature order the models are trained on.
feed = df[['CAO_Score', 'Budget', 'City_code', 'Interest_code', 'Job_domain_code','Course_Name']]

# Choosing independent vars
df_train_x = feed.drop('Course_Name',axis = 1)

# Choosing all target vars
df_train_y = feed['Course_Name']

# Hold out half of the rows for testing; random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.50, random_state=42)

In [60]:
# List the numeric columns of df (the encoded features).
print("List of Numerical values: \n" , df.select_dtypes(include=np.number).columns.tolist())

List of Numerical values: 
 ['CAO_Score', 'Budget', 'City_code', 'Interest_code', 'Job_domain_code']


In [61]:
# Wrap the target values in single-column DataFrames named "Course_Name".
y_train = pd.DataFrame(y_train, columns = ["Course_Name"])
# NOTE: this is not the last expression of the cell, so its preview is not rendered.
y_train.head()
y_test = pd.DataFrame(y_test, columns = ["Course_Name"])
# Last expression of the cell: rendered as the cell output.
y_test

Unnamed: 0,Course_Name
1448,Early Childhood Studies Care & Education (WD592)
2932,Athletic & Rehabilitation Therapy - Athlone Ca...
794,Augmented & Virtual Reality (DK722)
1029,Bachelor of Financial Mathematics and Economic...
8,Environmental Science and Engineering (TR064)
...,...
4587,Computer Engineering\r\nBachelor of Engineerin...
3598,Business - Management Stream (WD510)
3197,Adult Basic Education (48532)
2176,Engineering - Innovative Technology Engineerin...


# Implementing ML algorithms

Decision Tree

In [62]:
# Train a decision tree (fixed seed, otherwise default settings) on the
# training half, then score it on the held-out half.
dtree = DecisionTreeClassifier(random_state=1).fit(x_train, y_train)
y_pred = dtree.predict(x_test)

# Overall accuracy plus the per-class confusion matrix of the test predictions.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print("confusion matrics=",cm)
print("  ")
print("accuracy=",accuracy)

confusion matrics= [[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]
  
accuracy= 0.5956738768718802


In [63]:
# One hand-built sample, in the training column order:
# [CAO_Score, Budget, City_code, Interest_code, Job_domain_code].
userdata = [[1,0,4,4,1]]
# Predicted course label for the sample.
ynewclass = dtree.predict(userdata)
# Probability the tree assigns to every class for the sample.
ynew = dtree.predict_proba(userdata)
print(ynewclass)
print("Probabilities of all classes: ", ynew)
# Largest probability in the row.
print("Probability of Predicted class : ", np.max(ynew))

['BA (Hons) in Audio and Music Technology']
Probabilities of all classes:  [[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.16666667 0.         0.         0.
  0.         0.         0

## RANDOM FOREST

In [64]:
# Train a random forest (fixed seed, otherwise default settings) on the
# training half, then score it on the held-out half.
rf = RandomForestClassifier(random_state = 10).fit(x_train, y_train)
rfc_y_pred = rf.predict(x_test)

# Overall accuracy plus the per-class confusion matrix of the test predictions.
rfc_accuracy = accuracy_score(y_test, rfc_y_pred)
rfc_cm = confusion_matrix(y_test, rfc_y_pred)

print("confusion matrics=",rfc_cm)
print("  ")
print("accuracy=",rfc_accuracy)

confusion matrics= [[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]
  
accuracy= 0.5956738768718802


In [65]:
# One hand-built sample, in the training column order:
# [CAO_Score, Budget, City_code, Interest_code, Job_domain_code].
userdata = [[2,0,4,2,5]]
# Predicted course label for the sample.
ynewclass = rf.predict(userdata)
# Probability the forest assigns to every class for the sample.
ynew = rf.predict_proba(userdata)
print(ynewclass)
print("Probabilities of all classes: ", ynew)
# Largest probability in the row.
print("Probability of Predicted class : ", np.max(ynew))

['Bachelor of Science (Honours) in Biomedical & Molecular Diagnostics']
Probabilities of all classes:  [[0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        0.        0.
  0.        0.        0.        0.        0.        

In [66]:
import pickle 
# Persist the fitted decision tree. The context manager closes (and thereby
# flushes) the file before anything tries to read it back.
with open('predict.pkl', 'wb') as model_file:
    pickle.dump(dtree, model_file)

In [67]:
# Load the pickled model back from disk; the file is closed as soon as the
# object has been read. Despite its name, `regressor` holds whatever was
# written to predict.pkl (the decision-tree classifier saved in this notebook).
# Only unpickle files you created yourself: pickle.load can execute code.
with open("predict.pkl", "rb") as pickleFile:
    regressor = pickle.load(pickleFile)

In [68]:
!pip install Flask-Cors

Collecting Flask-Cors
  Downloading Flask_Cors-3.0.10-py2.py3-none-any.whl (14 kB)
Installing collected packages: Flask-Cors
Successfully installed Flask-Cors-3.0.10


In [69]:
from flask import Flask, request, render_template
from flask_cors import cross_origin
import sklearn
import pickle
import pandas as pd


# Flask application that serves the recommender form.
app = Flask(__name__)

# Load the trained model once at start-up; the context manager closes the
# file handle instead of leaving it open for the lifetime of the process.
# Only unpickle files you created yourself: pickle.load can execute code.
with open("predict.pkl", "rb") as model_file:
    model = pickle.load(model_file)


@app.route("/")
@cross_origin()
def home():
    """Serve the landing page by rendering the ``home.html`` template."""
    return render_template("home.html")

# Lookup tables translating form values into the integer codes the model was
# trained on. They must stay in sync with the encoding cell of the notebook.

# CAO_Score and Budget were trained as Low=0 / Medium=1 / High=2.
_LEVEL_CODES = {'Low': 0, 'Medium': 1, 'High': 2}

# City codes as previously hard-coded here. Only Dublin=4 is confirmed by the
# notebook output; code 7 has no entry.
# TODO confirm against df['City'].cat.categories.
_CITY_CODES = {
    'Athlone': 0, 'Carlow': 1, 'Cork': 2, 'Donegal': 3, 'Dublin': 4,
    'Dundalk': 5, 'Galway': 6, 'Letterkenny': 8, 'Limerick': 9, 'Mayo': 10,
    'Sligo': 11, 'Tipperary': 12, 'Waterford': 13, 'Wexford': 14,
}

# Interest codes taken from the encoded frame shown in the notebook
# (Athletics=0, Chess=2, Cricket=3, Football=4, Singing=5). Automation does not
# appear in the previewed rows; 1 is the only unused code between Athletics
# and Chess — TODO confirm against df['Interest'].cat.categories.
_INTEREST_CODES = {
    'Athletics': 0, 'Automation': 1, 'Chess': 2,
    'Cricket': 3, 'Football': 4, 'Singing': 5,
}

# Job-domain codes; Finance/HR/IT/Management/Support match the encoded frame
# shown in the notebook, Law keeps its previously hard-coded value.
_JOB_DOMAIN_CODES = {
    'Finance': 0, 'HR': 1, 'IT': 2, 'Law': 3, 'Management': 4, 'Support': 5,
}


@app.route("/predict", methods = ["GET", "POST"])
@cross_origin()
def predict():
    """Recommend a course for the preferences submitted through the form.

    POST: reads ``City``, ``Interest``, ``Job_domain``, ``Budget`` and
    ``CAO_Score`` from the submitted form, converts each to its training-time
    integer code and renders ``home.html`` with the model's prediction in
    ``prediction_text``. Unrecognised values fall back to code 0
    (Athlone / Athletics / Finance / Low).

    GET: renders the empty ``home.html`` form.

    Fields are read with ``request.form[...]``, so a missing field is reported
    by Flask as a bad request rather than being defaulted.
    """
    # Anything other than a form submission just shows the page. Previously
    # this fallback was unreachable and a GET returned None.
    if request.method != "POST":
        return render_template("home.html")

    form = request.form
    city_code = _CITY_CODES.get(form["City"], 0)
    interest_code = _INTEREST_CODES.get(form["Interest"], 0)
    job_domain_code = _JOB_DOMAIN_CODES.get(form["Job_domain"], 0)
    budget_code = _LEVEL_CODES.get(form["Budget"], 0)
    cao_score_code = _LEVEL_CODES.get(form["CAO_Score"], 0)

    # Feature order must match the training frame:
    # [CAO_Score, Budget, City_code, Interest_code, Job_domain_code].
    # All five values now come from the request; CAO_Score and the job domain
    # were previously hard-coded to 2 and 4.
    prediction = model.predict([[
        cao_score_code,
        budget_code,
        city_code,
        interest_code,
        job_domain_code,
    ]])

    # Show the predicted label itself rather than a (label, 2) tuple.
    output = prediction[0]

    return render_template('home.html',prediction_text="The College recommended is. {}".format(output))




# Start Flask's development server when this code runs as the main module
# (which is the case inside a notebook kernel).
# NOTE(review): app.run() blocks until interrupted ("Press CTRL+C to quit"), so
# cells after this one do not execute during a Run All.
# NOTE(review): debug=True turns on the interactive debugger; keep it to local
# use only. use_reloader=False presumably avoids the reloader restarting the
# process inside the kernel — confirm.
if __name__ == "__main__":
    app.run(debug=True,use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [None]:
from flask import Flask, request, render_template
from flask_cors import cross_origin
import sklearn
import pickle
import pandas as pd


app = Flask(__name__)

# Load the trained model; the context manager closes the file handle once the
# object has been read instead of leaving it open.
with open("predict.pkl", "rb") as model_file:
    model = pickle.load(model_file)

# Smoke test with one fixed feature row, in the training column order:
# [CAO_Score, Budget, City_code, Interest_code, Job_domain_code].
# NOTE(review): Budget was encoded as 0/1/2 for training, so the 3 below lies
# outside the encoded range — confirm this is intentional.
prediction = model.predict([[
    2,
    3,
    4,
    2,
    4,
]])

# Printed as a (label, 2) tuple, matching the recorded output of this cell.
output=(prediction[0],2)
print(output)

('Bachelor of Genetics and Genomics (GY321)', 2)
