Importing the dependencies

In [52]:
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

Data Collection and Pre-Processing

In [53]:
# loading the data from the csv file to apandas dataframe
data = pd.read_csv('/content/Properties.csv')

In [54]:
# printing the first 5 rows of the dataframe
data.head()

Unnamed: 0,Index,Property Address,Insight 1,Insight 2,Driver,Account,Criticality,Roles
0,1,"2483 Olson Via St, Houston",Budgets,Janitorial Expenses 10% Over Mean,Energy Costs above 15% mean,Bank of ACME,High,Asset Manager
1,2,"8389 Chad Village St, Houston",Energy Usage,Off-Hours,High energy usage during peak hours,Quantum Dynamics,Critical,Chief Engineer
2,3,"3427 James Shoal St, Boston",Energy Usage,Off-Hours,High energy usage during peak hours,StreamView Media,Low,Chief Engineer
3,4,"9659 Angela Estates St, New York",Energy Usage,Energy Efficiency,High energy usage during peak hours,NexaRetail,Low,Chief Engineer
4,5,"6752 Cameron Extension St, Atlanta",Budgets,Budget Compliance,Janitorial Budget above 20% mean,GreenLeaf Co.,Critical,Asset Manager


In [55]:
# number of rows and columns in the data frame
data.shape

(1300, 8)

In [56]:
# selecting the relevant features for recommendation

selected_features = ['Insight 1', 'Insight 2', 'Driver']
print(selected_features)

['Insight 1', 'Insight 2', 'Driver']


In [57]:
# replacing the null valuess with null string

for feature in selected_features:
  data[feature] = data[feature].fillna('')

In [58]:
# combining all the selected features

combined_features = data['Insight 1']+" "+data['Insight 2']+" "+data['Driver']

In [59]:
print(combined_features)

0       Budgets Janitorial Expenses 10% Over Mean Ener...
1       Energy Usage Off-Hours High energy usage durin...
2       Energy Usage Off-Hours High energy usage durin...
3       Energy Usage Energy Efficiency High energy usa...
4       Budgets Budget Compliance Janitorial Budget ab...
                              ...                        
1295    Work Orders Completion Rate Completion Rates b...
1296    Work Orders Preventative High number of reacti...
1297    Budgets Energy Cost Variance Janitorial Budget...
1298         Energy Usage Off-Hours Low energy efficiency
1299    Accessibility ADA Compliance Non-ADA compliant...
Length: 1300, dtype: object


In [60]:
# converting the text data to feature vectors

vectorizer = TfidfVectorizer()

In [61]:
feature_vectors = vectorizer.fit_transform(combined_features)

In [62]:
print(feature_vectors)

  (0, 1)	0.302329894295011
  (0, 4)	0.24109484785845453
  (0, 20)	0.302329894295011
  (0, 26)	0.19292761507093373
  (0, 46)	0.48218969571690906
  (0, 57)	0.34383349113234274
  (0, 0)	0.34383349113234274
  (0, 27)	0.34383349113234274
  (0, 39)	0.2766720848189496
  (0, 13)	0.24109484785845453
  (1, 58)	0.28429306690738443
  (1, 22)	0.30489207969578264
  (1, 35)	0.21055718350254776
  (1, 36)	0.51964587374194
  (1, 55)	0.33914595716593493
  (1, 77)	0.4895230538617489
  (1, 26)	0.39648368150718805
  (2, 58)	0.28429306690738443
  (2, 22)	0.30489207969578264
  (2, 35)	0.21055718350254776
  (2, 36)	0.51964587374194
  (2, 55)	0.33914595716593493
  (2, 77)	0.4895230538617489
  (2, 26)	0.39648368150718805
  (3, 23)	0.2901889772498553
  :	:
  (1296, 59)	0.37932048567315546
  (1296, 56)	0.5353049538241155
  (1296, 79)	0.5353049538241155
  (1296, 35)	0.22223943949336775
  (1297, 78)	0.4068328845283629
  (1297, 19)	0.4068328845283629
  (1297, 2)	0.36739947281736207
  (1297, 12)	0.3384948799478783
  (

Cosine Similarity

In [63]:
# getting the similarity scores using cosine similarity

similarity = cosine_similarity(feature_vectors)

In [64]:
print(similarity)

[[1.         0.07649265 0.07649265 ... 0.41776706 0.10382637 0.        ]
 [0.07649265 1.         1.         ... 0.09228524 0.71538712 0.        ]
 [0.07649265 1.         1.         ... 0.09228524 0.71538712 0.        ]
 ...
 [0.41776706 0.09228524 0.09228524 ... 1.         0.12526225 0.        ]
 [0.10382637 0.71538712 0.71538712 ... 0.12526225 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [65]:
print(similarity.shape)

(1300, 1300)


Getting the job name from the user

# New Section

In [66]:
name = input(' Enter your Role : ')

 Enter your Role : Technician


In [67]:
# creating a list with all the job roles given in the dataset

list_of_all_titles = data['Roles'].tolist()
print(list_of_all_titles)
print(len(list_of_all_titles))
cleanedList = [x for x in list_of_all_titles if str(x) != 'nan']
print(cleanedList)

['Asset Manager', 'Chief Engineer', 'Chief Engineer', 'Chief Engineer', 'Asset Manager', 'Maintenance Supervisor', 'Asset Manager', 'Chief Engineer', 'Leasing Manager', 'Leasing Manager', 'Chief Engineer', 'Facility Coordinator', 'Leasing Manager', 'Facility Coordinator', 'Facility Coordinator', 'Maintenance Supervisor', 'Leasing Manager', 'Maintenance Supervisor', 'Leasing Manager', 'Asset Manager', 'Maintenance Supervisor', 'Chief Engineer', 'Asset Manager', 'Maintenance Supervisor', 'Leasing Manager', 'Chief Engineer', 'Chief Engineer', 'Chief Engineer', 'Maintenance Supervisor', 'Maintenance Supervisor', 'Maintenance Supervisor', 'Facility Coordinator', 'Asset Manager', 'Maintenance Supervisor', 'Leasing Manager', 'Asset Manager', 'Leasing Manager', 'Maintenance Supervisor', 'Maintenance Supervisor', 'Leasing Manager', 'Leasing Manager', 'Facility Coordinator', 'Chief Engineer', 'Chief Engineer', 'Maintenance Supervisor', 'Asset Manager', 'Asset Manager', 'Facility Coordinator', 'M

In [68]:
# finding the close match for the movie name given by the user

find_close_match = difflib.get_close_matches(name, list_of_all_titles, n = 5, cutoff = 0.1)
print(find_close_match)

['Technician', 'Technician', 'Technician', 'Technician', 'Technician']


In [69]:
close_match = find_close_match[0]
print(close_match)

Technician


In [70]:
# finding the index of the similar insights

index_of_the_movie = data[data.Roles == close_match]['Index'].values
print(index_of_the_movie)

[1001 1007 1008 1017 1024 1025 1047 1061 1072 1084 1100 1108 1123 1143
 1156 1161 1167 1168 1175 1185 1191 1197 1198 1202 1218 1224 1231 1244
 1248 1259 1262 1275 1278 1300]


In [71]:
for index1 in index_of_the_movie:
  #print(index1)
  print(data['Driver'][index1-1])

Frequent elevator breakdowns
Lack of nearby public transport
Lack of nearby public transport
Non-ADA compliant areas
Lack of nearby public transport
Frequent elevator breakdowns
Lack of nearby public transport
Frequent elevator breakdowns
Lack of nearby public transport
Non-ADA compliant areas
Non-ADA compliant areas
Lack of nearby public transport
Frequent elevator breakdowns
Frequent elevator breakdowns
Lack of nearby public transport
Non-ADA compliant areas
Non-ADA compliant areas
Non-ADA compliant areas
Frequent elevator breakdowns
Non-ADA compliant areas
Lack of nearby public transport
Frequent elevator breakdowns
Frequent elevator breakdowns
Lack of nearby public transport
Non-ADA compliant areas
Frequent elevator breakdowns
Lack of nearby public transport
Frequent elevator breakdowns
Non-ADA compliant areas
Lack of nearby public transport
Non-ADA compliant areas
Non-ADA compliant areas
Frequent elevator breakdowns
Non-ADA compliant areas


In [72]:
# getting a list of similar insights

similarity_score = list(enumerate(similarity[index_of_the_movie[0]]))
print(similarity_score)

[(0, 0.0), (1, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.08943326869062102), (8, 0.5900031538450032), (9, 0.5900031538450032), (10, 0.11211176267735667), (11, 0.0), (12, 0.20752100110292374), (13, 0.0), (14, 0.0), (15, 0.2110822939919808), (16, 0.20752100110292374), (17, 0.1094995404906161), (18, 1.0000000000000002), (19, 0.0), (20, 0.11147275374792427), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.7689437353078445), (25, 0.11211176267735667), (26, 0.10852255383031433), (27, 0.0), (28, 0.10729505874069185), (29, 0.11147275374792427), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.18923240792459853), (35, 0.0), (36, 0.20752100110292374), (37, 0.1094995404906161), (38, 0.2110822939919808), (39, 0.5900031538450032), (40, 0.20752100110292374), (41, 0.0), (42, 0.10852255383031433), (43, 0.08943326869062102), (44, 0.10729505874069185), (45, 0.0), (46, 0.0), (47, 0.0), (48, 0.10729505874069185), (49, 0.20752100110292374), (50, 0.0), (51, 0.08943326869062102), (52, 0.0), (53

In [73]:
len(similarity_score)

1300

In [74]:
# sorting the insights based on their similarity score

sorted_similar = sorted(similarity_score, key = lambda x:x[1], reverse = True)
print(sorted_similar)

[(18, 1.0000000000000002), (74, 1.0000000000000002), (107, 1.0000000000000002), (137, 1.0000000000000002), (163, 1.0000000000000002), (165, 1.0000000000000002), (315, 1.0000000000000002), (341, 1.0000000000000002), (350, 1.0000000000000002), (367, 1.0000000000000002), (443, 1.0000000000000002), (451, 1.0000000000000002), (464, 1.0000000000000002), (492, 1.0000000000000002), (494, 1.0000000000000002), (543, 1.0000000000000002), (590, 1.0000000000000002), (604, 1.0000000000000002), (676, 1.0000000000000002), (711, 1.0000000000000002), (723, 1.0000000000000002), (745, 1.0000000000000002), (942, 1.0000000000000002), (1001, 1.0000000000000002), (1079, 1.0000000000000002), (1108, 1.0000000000000002), (1115, 1.0000000000000002), (1185, 1.0000000000000002), (1210, 1.0000000000000002), (24, 0.7689437353078445), (64, 0.7689437353078445), (248, 0.7689437353078445), (249, 0.7689437353078445), (266, 0.7689437353078445), (327, 0.7689437353078445), (366, 0.7689437353078445), (378, 0.7689437353078445)

In [75]:
# print the name of similar insights based on the index

print('Roles suggested for you : \n')

i = 1
for insight in sorted_similar:
  index = insight[0]
  title_from_index = data[data.index==index]['Insight 1']
  if (i<5):
    print(i, '.',title_from_index)
    i+=1

Roles suggested for you : 

1 . 18    Leases
Name: Insight 1, dtype: object
2 . 74    Leases
Name: Insight 1, dtype: object
3 . 107    Leases
Name: Insight 1, dtype: object
4 . 137    Leases
Name: Insight 1, dtype: object


Recommendation Sytem

In [77]:
name = input(' Enter your Role : ')

list_of_all_titles = data['Roles'].tolist()

cleanedList = [x for x in list_of_all_titles if str(x) != 'nan']

find_close_match = difflib.get_close_matches(name, cleanedList)

close_match = find_close_match[0]

index_of_the_m = data[data.Roles == close_match]['Index'].values[0]

similarity_score = list(enumerate(similarity[index_of_the_m]))

sorted_similar_m = sorted(similarity_score, key = lambda x:x[1], reverse = True)

print('Roles suggested for you : \n')

i = 1

for insight in sorted_similar_m:
  index = insight[0]
  title_from_index = data[data.index==index]['Insight 2'].values[0]
  if (i<30):
    print(i, '.',title_from_index)
    i+=1

 Enter your Role : Technician
Roles suggested for you : 

1 . Renewal Rate
2 . Renewal Rate
3 . Renewal Rate
4 . Renewal Rate
5 . Renewal Rate
6 . Renewal Rate
7 . Renewal Rate
8 . Renewal Rate
9 . Renewal Rate
10 . Renewal Rate
11 . Renewal Rate
12 . Renewal Rate
13 . Renewal Rate
14 . Renewal Rate
15 . Renewal Rate
16 . Renewal Rate
17 . Renewal Rate
18 . Renewal Rate
19 . Renewal Rate
20 . Renewal Rate
21 . Renewal Rate
22 . Renewal Rate
23 . Renewal Rate
24 . Renewal Rate
25 . Renewal Rate
26 . Renewal Rate
27 . Renewal Rate
28 . Renewal Rate
29 . Renewal Rate
