In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB
import warnings
warnings.filterwarnings("ignore")

In [2]:
pwd

'c:\\Users\\rahul\\Desktop\\Group Project\\repository\\MyUniLifeSimulation\\SRC\\Recommender System src\\Recommender System Backup'

In [3]:
df = pd.read_csv('recommender_system_data.csv',encoding='cp1252')
df.head()

Unnamed: 0,University,Course_Name,CAO Score,Budget,City,Interest,Job domain
0,University College Dublin,Biomedical Engineering (DN150),556,6000,Dublin,Cricket,IT
1,University College Dublin,Chem & Bioprocess Engineering (NQS2),528,5000,Dublin,Football,HR
2,University College Dublin,Civil Engineering (DN150),551,4500,Dublin,Singing,Management
3,University College Dublin,Computer Science (DN201),542,3500,Dublin,Chess,Support
4,University College Dublin,Electrical/Electronics Engineering (DN150),555,6500,Dublin,Athletics,Finance


In [4]:
df['University']=df['University'].str.strip()
df['Course_Name']=df['Course_Name'].str.strip()
df=df.drop(['University'], axis=1)
df.rename(columns = {'CAO Score':'CAO_Score'}, inplace = True)
df.rename(columns = {'Job domain':'Job_domain'}, inplace = True)

In [5]:
def CAO_SCORE(score):
    if  (score >= 50)  and (score <250 ) :
        return "Low"
    elif  (score >= 250)  and (score <500 ) :
        return "Medium"
    elif (score >= 500):
        return "High"
    #else: return 'Neutral'

df['CAO_Score'] = df['CAO_Score'].apply(CAO_SCORE)

In [6]:
def Budget(Budget):
    if  (Budget >= 2000)  and (Budget <5000 ) :
        return "Low"
    elif  (Budget >= 5000)  and (Budget <7000 ) :
        return "Medium"
    elif (Budget >= 7000):
        return "High"
    #else: return 'Neutral'

df['Budget'] = df['Budget'].apply(Budget)

In [7]:
df.tail()

Unnamed: 0,Course_Name,CAO_Score,Budget,City,Interest,Job_domain
4803,Education Mathematics & Business Studies - Thu...,Medium,Low,Tipperary,Cricket,Support
4804,Education Mathematics & Gaeilge - Thurles Camp...,Medium,Low,Tipperary,Football,Law
4805,Strength & Conditioning (SC701),Medium,Medium,Dublin,Singing,IT
4806,Strength & Conditioning (SC801),Medium,Medium,Dublin,Chess,Support
4807,Strength & Conditioning (SC601),Medium,Low,Dublin,Cricket,IT


In [8]:
#df.to_csv('Test.csv')

# Data Preprocessing

In [9]:
print("All the columns in the dataset: " , df.columns)

All the columns in the dataset:  Index(['Course_Name', 'CAO_Score', 'Budget', 'City', 'Interest', 'Job_domain'], dtype='object')


In [10]:
print('Shape of data is: %s entries and %s column'%(df.shape[0],df.shape[1]))

Shape of data is: 4808 entries and 6 column


In [11]:
print("Numeric Columns in DF: \n" , df.select_dtypes(include=np.number).columns.tolist())
print("\n\nCategorical Columns in DF: \n" , df.select_dtypes(include=['object']).columns.tolist())

Numeric Columns in DF: 
 []


Categorical Columns in DF: 
 ['Course_Name', 'CAO_Score', 'Budget', 'City', 'Interest', 'Job_domain']


In [12]:
df.isnull().sum(axis=0)

Course_Name    0
CAO_Score      0
Budget         0
City           0
Interest       0
Job_domain     0
dtype: int64

In [13]:
cat_col = df[['Course_Name', 'CAO_Score', 'Budget', 'City', 'Interest', 'Job_domain']]
for i in cat_col:
    print(df[i].value_counts(), end="\n\n")

Bachelor of Engineering Agricultural Systems Engineering (SE733)                                                                        24
Bachelor of Engineering (Honours) Agricultural Systems Engineering (SE732)                                                              24
Bachelor of Business (allowing later specialisation in Business or Business with a Language or Business with Information Technology)    24
Adult Green Cert - Part-time (5M20454 6S20487)                                                                                          13
Education in Teaching & Learning (WD603)                                                                                                12
                                                                                                                                        ..
Bachelor of Science (Honours) Architectural Technology                                                                                   1
Bachelor of Engineering (Ho

In [14]:
# print(df["University"].value_counts())

## Feature Selection & Engineering

One Hot Encoding For Categorical values

In [15]:
mycol = df[['CAO_Score', 'Budget']]
for i in mycol:
    cleanup_nums = {i: {"Low": 0, "Medium": 1, "High": 2}}
    df = df.replace(cleanup_nums)

cat_col = df[['City', 'Interest', 'Job_domain']]
for i in cat_col:
    df[i] = df[i].astype('category')
    df[i + "_code"] = df[i].cat.codes

print("\n\nList of Categorical values: \n" , df.select_dtypes(include=['object']).columns.tolist())



List of Categorical values: 
 ['Course_Name']


In [16]:
df.head(5)

Unnamed: 0,Course_Name,CAO_Score,Budget,City,Interest,Job_domain,City_code,Interest_code,Job_domain_code
0,Biomedical Engineering (DN150),2,1,Dublin,Cricket,IT,4,3,2
1,Chem & Bioprocess Engineering (NQS2),2,1,Dublin,Football,HR,4,4,1
2,Civil Engineering (DN150),2,0,Dublin,Singing,Management,4,5,4
3,Computer Science (DN201),2,0,Dublin,Chess,Support,4,2,5
4,Electrical/Electronics Engineering (DN150),2,1,Dublin,Athletics,Finance,4,0,0


In [17]:
print("\n\nList of Categorical values: \n" , df.select_dtypes(include=['object']).columns.tolist())
print("List of Numerical values: \n" , df.select_dtypes(include=np.number).columns.tolist())



List of Categorical values: 
 ['Course_Name']
List of Numerical values: 
 ['CAO_Score', 'Budget', 'City_code', 'Interest_code', 'Job_domain_code']


In [18]:
feed = df[['CAO_Score', 'Budget', 'City_code', 'Interest_code', 'Job_domain_code','Course_Name']]

# Choosing independent vars
df_train_x = feed.drop('Course_Name',axis = 1)

# Choosing all target vars
df_train_y = feed['Course_Name']

x_train, x_test, y_train, y_test = train_test_split(df_train_x, df_train_y, test_size=0.50, random_state=42)

In [19]:
print("List of Numerical values: \n" , df.select_dtypes(include=np.number).columns.tolist())

List of Numerical values: 
 ['CAO_Score', 'Budget', 'City_code', 'Interest_code', 'Job_domain_code']


In [20]:
y_train = pd.DataFrame(y_train, columns = ["Course_Name"])
y_train.head()
y_test = pd.DataFrame(y_test, columns = ["Course_Name"])
y_test

Unnamed: 0,Course_Name
1448,Early Childhood Studies Care & Education (WD592)
2932,Athletic & Rehabilitation Therapy - Athlone Ca...
794,Augmented & Virtual Reality (DK722)
1029,Bachelor of Financial Mathematics and Economic...
8,Environmental Science and Engineering (TR064)
...,...
4587,Computer Engineering\r\nBachelor of Engineerin...
3598,Business - Management Stream (WD510)
3197,Adult Basic Education (48532)
2176,Engineering - Innovative Technology Engineerin...


#Implementing ML algotithms

Decision Tree

In [21]:
dtree = DecisionTreeClassifier(random_state=1)
dtree = dtree.fit(x_train, y_train)
y_pred = dtree.predict(x_test)
cm = confusion_matrix(y_test,y_pred)
accuracy = accuracy_score(y_test,y_pred)
print("confusion matrics=",cm)
print("  ")
print("accuracy=",accuracy)

confusion matrics= [[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]
  
accuracy= 0.5956738768718802


K-Fold Cross Validation

In [22]:
from sklearn.model_selection import cross_val_score
dtree = DecisionTreeClassifier(random_state=42)
scores = cross_val_score(dtree, x_train, y_train, cv = 10)
scores

array([0.65975104, 0.63070539, 0.63070539, 0.63070539, 0.65      ,
       0.6       , 0.6       , 0.60833333, 0.60833333, 0.625     ])

In [23]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean()*1, scores.std()))

0.62 accuracy with a standard deviation of 0.02


## RANDOM FOREST

In [24]:
rf = RandomForestClassifier(random_state = 42)
rf.fit(x_train, y_train)
rfc_y_pred = rf.predict(x_test)
rfc_cm = confusion_matrix(y_test,rfc_y_pred)
rfc_accuracy = accuracy_score(y_test,rfc_y_pred)
print("confusion matrics=",rfc_cm)
print("  ")
print("accuracy=",rfc_accuracy)

confusion matrics= [[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]
  
accuracy= 0.5956738768718802


K-Fold Cross Validation

In [25]:
rf = RandomForestClassifier(random_state = 42)
scores = cross_val_score(rf, x_train, y_train, cv = 10)
scores

array([0.65975104, 0.63900415, 0.63485477, 0.63485477, 0.65      ,
       0.6125    , 0.6       , 0.61666667, 0.62916667, 0.625     ])

In [26]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.63 accuracy with a standard deviation of 0.02


### SVM

In [27]:
from sklearn import svm
clf = svm.SVC(kernel='linear') # Linear Kernel
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]


0.5973377703826955

K-Fold Cross Validaiton

In [28]:
from sklearn import svm
svm = svm.SVC(kernel='linear')
scores_svm = cross_val_score(svm, x_train, y_train, cv = 10)
scores_svm

array([0.64315353, 0.63900415, 0.61825726, 0.62655602, 0.64166667,
       0.6125    , 0.60833333, 0.62083333, 0.62916667, 0.63333333])

In [29]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_svm.mean(), scores_svm.std()))

0.63 accuracy with a standard deviation of 0.01


### Logistic Regression

In [30]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression(random_state=42)
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)


In [31]:
y_pred = reg.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


0.3847753743760399

K-Fold Cross Validation

In [32]:
reg = LogisticRegression()
scores_reg = cross_val_score(reg, x_train, y_train, cv = 10)
scores_reg

array([0.48547718, 0.46887967, 0.46058091, 0.43153527, 0.475     ,
       0.44583333, 0.45      , 0.44166667, 0.42916667, 0.4375    ])

In [33]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_reg.mean()*100, scores_reg.std()))

45.26 accuracy with a standard deviation of 0.02


### KNN

In [34]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=4)
neigh.fit(x_train, y_train)

In [35]:
y_pred = neigh.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]


0.6085690515806988

K-Fold Cross Validation

In [36]:
knn = KNeighborsClassifier(n_neighbors=4)
scores_knn = cross_val_score(knn, x_train, y_train, cv = 10)
scores_knn

array([0.62240664, 0.62655602, 0.60995851, 0.59751037, 0.64166667,
       0.59583333, 0.60416667, 0.625     , 0.6375    , 0.62916667])

In [37]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_knn.mean()*100, scores_knn.std()))

61.90 accuracy with a standard deviation of 0.02


### Naive-Bayes

In [38]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(x_train, y_train)

In [39]:
# Model Performance
y_pred = classifier.predict(x_test)

from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)

[[4 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 ...
 [0 0 0 ... 5 0 0]
 [0 0 0 ... 0 9 0]
 [0 0 0 ... 0 0 0]]


0.5965058236272879

K-Fold Cross Validation

In [40]:
gnb = GaussianNB()
scores_gnb = cross_val_score(gnb, x_train, y_train, cv = 10)
scores_gnb

array([0.65560166, 0.63070539, 0.63070539, 0.62655602, 0.65      ,
       0.60416667, 0.6       , 0.6125    , 0.61666667, 0.62916667])

In [41]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores_gnb.mean()*100, scores_gnb.std()))

62.56 accuracy with a standard deviation of 0.02


# Model Evaluations

## Classification Reports

### Random Forest Classification Report

In [43]:
from sklearn.metrics import classification_report
rfc = RandomForestClassifier(random_state=1)
rfc.fit(x_train, y_train)
y_pred_classification = rfc.predict(x_test)

report = classification_report(y_pred_classification, y_test)

print("Classification Report: ")
print(report)

Classification Report: 
                                                                                                                                      precision    recall  f1-score   support

                                                                                 3D CAD & Solid Modelling - Cork Campus (CR_ECADM_6)       1.00      1.00      1.00         4
                                                                                       3D Computer aided design - Beginners (109425)       0.00      0.00      0.00         3
                                                                                                          Accountancy - ACCA (94479)       0.00      0.00      0.00         0
                                                                                                           Accountancy - CPA (56495)       0.00      0.00      0.00         0
                                       Accountants - Institute of Certified Public Accountants in Ireland

### KNN Classification Report

In [44]:
knn = KNeighborsClassifier(n_neighbors = 4)
knn.fit(x_train, y_train)
y_pred_classification = knn.predict(x_test)

report = classification_report(y_pred_classification, y_test)

print("Classification Report: ")
print(report)

Classification Report: 
                                                                                                                                      precision    recall  f1-score   support

                                                                                 3D CAD & Solid Modelling - Cork Campus (CR_ECADM_6)       1.00      0.67      0.80         6
                                                                                       3D Computer aided design - Beginners (109425)       0.00      0.00      0.00         2
                                                                                                          Accountancy - ACCA (94479)       0.00      0.00      0.00         0
                                                                                                           Accountancy - CPA (56495)       0.00      0.00      0.00         0
                                       Accountants - Institute of Certified Public Accountants in Ireland

### SVM Classification Report

In [47]:
from sklearn.metrics import classification_report
from sklearn.svm import SVC
clf = svm.SVC(kernel='linear') # Linear Kernel
clf.fit(x_train, y_train)
y_pred_classification = clf.predict(x_test)

report = classification_report(y_pred_classification, y_test)

print("Classification Report: ")
print(report)

AttributeError: 'SVC' object has no attribute 'SVC'

### Naive Bayes Classification Report

In [48]:
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred_classification = classifier.predict(x_test)

report = classification_report(y_pred_classification, y_test)

print("Classification Report: ")
print(report)

Classification Report: 
                                                                                                                                      precision    recall  f1-score   support

                                                                                 3D CAD & Solid Modelling - Cork Campus (CR_ECADM_6)       1.00      0.80      0.89         5
                                                                                       3D Computer aided design - Beginners (109425)       0.00      0.00      0.00         2
                                                                                                          Accountancy - ACCA (94479)       0.00      0.00      0.00         0
                                                                                                           Accountancy - CPA (56495)       0.00      0.00      0.00         0
                                       Accountants - Institute of Certified Public Accountants in Ireland

### Logistic Regression Classification Report

In [49]:
reg = LogisticRegression()
reg.fit(x_train, y_train)
y_pred_classification = reg.predict(x_test)

report = classification_report(y_pred_classification, y_test)

print("Classification Report: ")
print(report)

Classification Report: 
                                                                                                                                      precision    recall  f1-score   support

                                                                                 3D CAD & Solid Modelling - Cork Campus (CR_ECADM_6)       1.00      1.00      1.00         4
                                                                                                          Accountancy - ACCA (94479)       0.00      0.00      0.00         0
                                                                                                           Accountancy - CPA (56495)       0.00      0.00      0.00         0
                                       Accountants - Institute of Certified Public Accountants in Ireland - Cork Campus (CR_BCPAC_8)       0.00      0.00      0.00         0
                                                                             Accounting & Business - ACCA

### Decision Tree Classification Report

In [51]:
dt = DecisionTreeClassifier(random_state=1)
dt.fit(x_train, y_train)
y_pred_classification = dt.predict(x_test)

report = classification_report(y_pred_classification, y_test)

print("Classification Report: ")
print(report)

Classification Report: 
                                                                                                                                      precision    recall  f1-score   support

                                                                                 3D CAD & Solid Modelling - Cork Campus (CR_ECADM_6)       1.00      0.80      0.89         5
                                                                                       3D Computer aided design - Beginners (109425)       0.00      0.00      0.00         1
                                                                                                          Accountancy - ACCA (94479)       0.00      0.00      0.00         0
                                                                                                           Accountancy - CPA (56495)       0.00      0.00      0.00         0
                                       Accountants - Institute of Certified Public Accountants in Ireland

In [None]:
import pickle 
pickle.dump(dtree,open('predict.pkl','wb'))

In [None]:
pickleFile=open("predict.pkl","rb")
#importing the model which we created using dt in ml code
regressor=pickle.load(pickleFile) 

In [None]:
!pip install Flask-Cors



In [None]:
from flask import Flask, request, render_template
from flask_cors import cross_origin
import sklearn
import pickle
import pandas as pd


app = Flask(__name__)
model = pickle.load(open("predict.pkl", "rb"))


@app.route("/")
@cross_origin()
def home():
    return render_template("home.html")

@app.route("/predict", methods = ["GET", "POST"])
@cross_origin()
def predict():
    if request.method == "POST":       


        City = request.form["City"]
        if (City == 'Dublin'):
            City_code = 4

        
        elif (City == 'Carlow'):
            City_code = 1


        elif (City == 'Letterkenny'):
            City_code = 8 


        elif (City == 'Limerick'):
            City_code = 9

        elif (City == 'Athlone'):
            City_code = 0

        elif (City == 'Dundalk'):
            City_code = 5

        elif (City == 'Cork'):
            City_code = 2

        elif (City == 'Mayo'):
            City_code = 10
            
        elif (City == 'Donegal'):
            City_code = 3

        elif (City == 'Waterford'):
            City_code = 13

        elif (City == 'Wexford'):
            City_code = 14

        elif (City == 'Tipperary'):
            City_code = 12
            
        elif (City == 'Sligo'):
            City_code = 11

        elif (City == 'Galway'):
            City_code = 6


        else:
            City_code = 0

            
        Interest = request.form["Interest"]
        if (Interest == 'Cricket'):
            Interest_code = 4

        
        elif (Interest == 'Football'):
            Interest_code = 1


        elif (Interest == 'Singing'):
            Interest_code = 8 


        elif (Interest == 'Chess'):
            Interest_code = 9

        elif (Interest == 'Athletics'):
            Interest_code = 0

        elif (Interest == 'Automation'):
            Interest_code = 5

        else:
            Interest_code = 0

        Job_domain = request.form["Job_domain"]
        
        if (Job_domain == 'IT'):
            Job_domain_code = 2

        
        elif (Job_domain == 'HR'):
            Job_domain_code = 1


        elif (Job_domain == 'Management'):
            Job_domain_code = 4 


        elif (Job_domain == 'Support'):
            Job_domain_code = 5

        elif (Interest == 'Finance'):
            Job_domain_code = 0

        elif (Job_domain == 'Law'):
            Job_domain_code = 3

        else:
            Job_domain_code = 0
            


        Budget = request.form["Budget"]
        if (Budget == 'Low'):
            Budget = 1

        
        elif (Budget == 'Medium'):
            Budget = 2


        elif (Budget == 'High'):
            Budget = 3 


        else:
            Budget = 1
            

        CAO_Score = request.form["CAO_Score"]
        if (CAO_Score == 'Low'):
            CAO_Score = 1

        
        elif (CAO_Score == 'Medium'):
            CAO_Score = 2


        elif (CAO_Score == 'High'):
            CAO_Score = 3 


        else:
            CAO_Score = 1
            

        prediction=model.predict([[
            2,
            Budget,
            City_code,
            Interest_code,
            4
            #Job_domain_code
            
        ]])

        output=(prediction[0],2)

        return render_template('home.html',prediction_text="The College recommended is. {}".format(output))


        return render_template("home.html")




if __name__ == "__main__":
    app.run(debug=True,use_reloader=False)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit


In [None]:
from flask import Flask, request, render_template
from flask_cors import cross_origin
import sklearn
import pickle
import pandas as pd


app = Flask(__name__)
model = pickle.load(open("predict.pkl", "rb"))

prediction=model.predict([[
            2,
            3,
            4,
            2,
            4
            #Job_domain_code
            
        ]])
output=(prediction[0],2)
print(output)

('Bachelor of Genetics and Genomics (GY321)', 2)
