# Setup Environment


In [76]:
import pandas as pd
import numpy as np
import pymongo
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score

##### Mongo Connection Details

In [2]:
# Connection settings for the MongoDB instance that holds the course data.
MONGODB_SERVER = "localhost"  # host name of the MongoDB server
MONGODB_PORT = 27017  # TCP port of the MongoDB server
MONGODB_DB = "courses"  # name of the database holding the course collections

##### Load the Udemy and Udacity Mongo collections to merge

Prepare to clean the data and insert it into the cleaned collection.

In [3]:
# Build the client from the connection constants declared earlier instead of a
# duplicated literal URI, so host, port and database name live in one place.
mongo_client = pymongo.MongoClient(MONGODB_SERVER, MONGODB_PORT)
db = mongo_client[MONGODB_DB]

In [4]:
# Handle to the collection that receives the unified, cleaned course documents.
cleaned_courses_col=db.cleaned_courses
# Destructive: removes the existing collection so the cells below refill it.
cleaned_courses_col.drop()
# Initialised empty here; none of the visible cells append to it.
cleaned_courses=[]

In [5]:
# Handles to the raw per-provider source collections.
udacity_courses = db['udacity_courses']
udemy_courses = db['udemy_courses']

##### get cleaned udacity courses

In [6]:
# Reshape every raw Udacity document into the shared cleaned-course schema and
# store it in the cleaned collection, one insert per course.
for udacity_course in udacity_courses.find():
    data = dict(
        title=udacity_course['name'],
        url=udacity_course['url'],
        description=udacity_course['about'],
        # No syllabus field is read from the Udacity document; a placeholder is stored.
        syllabus='Not Avilable',
        skills=udacity_course['skills'],
        ratings_count=udacity_course['num_reviews'],
        prerequisites=udacity_course['prerequisites'],
        duration=udacity_course['duration'],
        category=udacity_course['category'],
        level=udacity_course['level'],
        schoolName=udacity_course['school_name'],
        instructors=udacity_course['instructors'],
        # No enrolment figure is read from the Udacity document; stored as 0.
        enrolled_students_count=0,
        avg_rating=udacity_course['avg_rating'],
        num_reviews=udacity_course['num_reviews'],
        price=udacity_course['price'],
        source='Udacity',
    )
    cleaned_courses_col.insert_one(data)

##### get cleaned udemy courses

In [7]:
import html2text

# Reshape every raw Udemy document into the shared cleaned-course schema and
# store it in the cleaned collection, one insert per course.
for udemy_course in udemy_courses.find():
    description_html = udemy_course['description']
    # A new converter is created for each course, with ignore_links enabled.
    html_converter = html2text.HTML2Text()
    html_converter.ignore_links = True
    data = dict(
        title=udemy_course['title'],
        url=udemy_course['url'],
        # NOTE(review): the first 76 characters of the converted text are
        # discarded; confirm that this fixed offset is intended.
        description=html_converter.handle(description_html)[76:],
        syllabus=udemy_course['objectives'],
        # The course title is stored as the skills value for Udemy courses.
        skills=udemy_course['title'],
        ratings_count=udemy_course['num_reviews'],
        prerequisites=udemy_course['prerequisites'],
        duration='Not Available',
        category='Course',
        level=udemy_course['instructional_level_simple'],
        schoolName='Provided by Udemy Instructors',
        instructors=[
            instructor['display_name']
            for instructor in udemy_course['visible_instructors']
        ],
        enrolled_students_count=udemy_course['num_subscribers'],
        avg_rating=udemy_course['avg_rating'],
        num_reviews=udemy_course['num_reviews'],
        price=udemy_course['price'],
        source='Udemy',
    )
    cleaned_courses_col.insert_one(data)

In [8]:
def _connect_mongo(host, port, username, password, db):
    """Open a MongoDB connection and return a handle to database ``db``.

    Args:
        host: Host name or address of the MongoDB server.
        port: TCP port of the MongoDB server.
        username: User name; authentication is used only when both
            ``username`` and ``password`` are truthy.
        password: Password matching ``username``, given as plain text
            (not pre-escaped).
        db: Name of the database to return (also placed in the URI path
            when authenticating).

    Returns:
        The ``conn[db]`` database object of the newly created client.
    """
    if username and password:
        # Imported locally so this function does not depend on a file-level import.
        from urllib.parse import quote_plus

        # Credentials must be percent-escaped before being embedded in the
        # URI; otherwise characters such as '@', ':' or '/' corrupt it.
        mongo_uri = 'mongodb://%s:%s@%s:%s/%s' % (
            quote_plus(str(username)),
            quote_plus(str(password)),
            host,
            port,
            db,
        )
        conn = pymongo.MongoClient(mongo_uri)
    else:
        conn = pymongo.MongoClient(host, port)
    return conn[db]

In [9]:
def read_mongo(db, collection, query=None, host='localhost', port=27017, username=None, password=None, no_id=True):
    """Load the documents of a MongoDB collection into a pandas DataFrame.

    Args:
        db: Name of the database to read from.
        collection: Name of the collection inside ``db``.
        query: Filter passed to ``find``; ``None`` (the default) selects
            every document.
        host: Host name of the MongoDB server.
        port: TCP port of the MongoDB server.
        username: Optional user name for authentication.
        password: Optional password for authentication.
        no_id: When true, the ``_id`` column is removed from the result.

    Returns:
        A DataFrame with one row per matching document; it is empty when
        nothing matches.
    """
    # ``None`` replaces the former shared mutable ``{}`` default argument.
    mongo_filter = {} if query is None else query
    database = _connect_mongo(host=host, port=port, username=username, password=password, db=db)
    try:
        # Materialise the cursor while the connection is still open.
        records = list(database[collection].find(mongo_filter))
    finally:
        # The client is created only for this call, so release it here.
        database.client.close()
    df = pd.DataFrame(records)
    # An empty result has no '_id' column; deleting it blindly raised KeyError.
    if no_id and '_id' in df.columns:
        del df['_id']
    return df

In [41]:
# Load the cleaned collection into a DataFrame, taking the server, port and
# database name from the connection constants rather than repeated literals.
df_clean_courses = read_mongo(MONGODB_DB, 'cleaned_courses', host=MONGODB_SERVER, port=MONGODB_PORT)

In [42]:
df_clean_courses.sample(10)

Unnamed: 0,title,url,description,syllabus,skills,ratings_count,prerequisites,duration,category,level,schoolName,instructors,enrolled_students_count,avg_rating,num_reviews,price,source
2144,Trade with just 11 Candlesticks signals – Reap...,https://www.udemy.com/course/trade-with-just-1...,you often regret selling and buying stocks as ...,[•\tLearn how to identify a big trend reversal...,Trade with just 11 Candlesticks signals – Reap...,9,"[•\tEveryone is welcome, •\tYou must have the ...",Not Available,Course,All Levels,Provided by Udemy Instructors,[Dr Bishram],13,4.666666,9,$89.99,Udemy
9052,ADSENSE ARBITRAGE: ADSENSE TRAFFIC ARBITRAGE,https://www.udemy.com/course/adsense-traffic-a...,\nto make money online. You don't have to make...,[you can make a extra or full time earnings wi...,ADSENSE ARBITRAGE: ADSENSE TRAFFIC ARBITRAGE,199,"[Adsense Account, Wordpress Website, 5$ For Fa...",Not Available,Course,All Levels,Provided by Udemy Instructors,[Raj Kumar],1002,4.2,199,$24.99,Udemy
9906,Create Winning Online Marketing Campaigns In M...,https://www.udemy.com/course/create-winning-on...,"es, automatically optimized over the entire ca...","[online marketing, digital markeiting, Online ...",Create Winning Online Marketing Campaigns In M...,53,[No special requirements or experience needed],Not Available,Course,All Levels,Provided by Udemy Instructors,[Andre Bolte],6705,4.5,53,$19.99,Udemy
2710,Affiliate Marketing Secrets Set Up a Business ...,https://www.udemy.com/course/clickbank-affilia...,ion \n\n“ _ **Clickbank: Affiliate Marketing ...,"[Pick the most profitable niche., Setting up a...",Affiliate Marketing Secrets Set Up a Business ...,202,"[You need to have internet connection, Absolu...",Not Available,Course,All Levels,Provided by Udemy Instructors,[Scrembo Paul],12203,3.4,202,$19.99,Udemy
4451,Humanoid Rigging For Games Using Houdini And K...,https://www.udemy.com/course/humanoid-rigging-...,e\ncompatible with game engines like Unity and...,[We will create a basic humanoid rig in houdini.],Humanoid Rigging For Games Using Houdini And K...,15,[Students will need a basic understanding of t...,Not Available,Course,Intermediate,Provided by Udemy Instructors,[Simon Hayes],103,4.566667,15,$44.99,Udemy
5670,Educational Psychology & Special Education (Ce...,https://www.udemy.com/course/educational-psych...,"gnitive, emotional, and social\nleaning proces...",[This course focuses on teaching educational p...,Educational Psychology & Special Education (Ce...,410,[n/a],Not Available,Course,All Levels,Provided by Udemy Instructors,[Dr. Bev Knox],1416,4.449153,410,$89.99,Udemy
2888,MSP® Programme Management - Introduction,https://www.udemy.com/course/msp-programme-man...,"thod called MSP® Programme Management, a best ...",[Understand the key principles and terminology...,MSP® Programme Management - Introduction,3,"[Familiarity with projects, programmes and Pro...",Not Available,Course,Beginner,Provided by Udemy Instructors,[Stone River eLearning],11,4.0,3,$89.99,Udemy
5917,Quran Level 1 - Qaida Nuraniyah - Child Boy Ed...,https://www.udemy.com/course/nuraniyahboyedition/,\nis the:**\n\n * ** _EASIEST_ and the _FAST...,[The child will be able to make a smooth trans...,Quran Level 1 - Qaida Nuraniyah - Child Boy Ed...,8,[No prior knowledge of Arabic language required],Not Available,Course,Beginner,Provided by Udemy Instructors,[Hamid Raza],39,4.75,8,$39.99,Udemy
6558,Presentation Skills: Master Confident Presenta...,https://www.udemy.com/course/presentations-mas...,dent public speaker - overcome nerves and deli...,"[Be a memorable and confident speaker, Give a ...",Presentation Skills: Master Confident Presenta...,7074,"[Nothing - just a willingness to learn!, A tal...",Not Available,Course,All Levels,Provided by Udemy Instructors,[Chris Croft],19000,4.52462,7074,$89.99,Udemy
7621,ICS/SCADA Network Security Monitoring (NSM),https://www.udemy.com/course/icsscada-network-...,t\nmonitor and control industrial processes. T...,[They will learn how to apply open source tool...,ICS/SCADA Network Security Monitoring (NSM),139,[It would be beneficial if you took my first c...,Not Available,Course,Intermediate,Provided by Udemy Instructors,[Ed Galarza],771,4.0,139,$89.99,Udemy


##### Convert all prices to EGP

In [43]:
def clean_price_egp(x, rate=16):
    """Convert a dollar price string such as ``'$89.99'`` to EGP.

    Args:
        x: Price text. A ``$`` sign, thousands separators (``,``) and
            surrounding whitespace are ignored. The word ``free`` (any
            letter case) is treated as a price of zero.
        rate: Multiplier applied to the parsed amount; defaults to the 16
            that was previously hardcoded.

    Returns:
        The parsed amount multiplied by ``rate``, as a float.

    Raises:
        ValueError: If the remaining text is not a valid number.
    """
    text = x.strip()
    if text.lower() == 'free':
        return 0.0
    return float(text.replace('$', '').replace(',', '')) * rate

In [44]:
# Convert the price of the Udemy rows only; the row mask is computed once.
is_udemy = df_clean_courses['source'] == 'Udemy'
udemy_prices = df_clean_courses.loc[is_udemy, 'price']
df_clean_courses.loc[is_udemy, ['price']] = udemy_prices.apply(clean_price_egp)

##### Take a look into categories and levels

In [46]:
df_clean_courses['category'].value_counts()

Course        10186
nanodegree      102
Name: category, dtype: int64

In [47]:
df_clean_courses['level'].value_counts()

All Levels      5371
Beginner        3137
Intermediate    1282
Expert           217
intermediate     161
beginner          71
advanced          46
                   1
Name: level, dtype: int64

In [58]:
df_clean_courses['source'].value_counts()

Udemy      10008
Udacity      276
Name: source, dtype: int64

##### Fix levels, as there are duplicates (labels differing only in case or wording)


In [49]:
# Merge level labels that differ only in spelling into a single label each.
# Labels not listed here (e.g. 'All Levels', 'Expert') are left unchanged.
level_aliases = {
    'Beginner': 'beginner',
    'Intermediate': 'intermediate',
    'advanced': 'Expert',
}
df_clean_courses['level'] = df_clean_courses['level'].replace(level_aliases)

In [50]:
df_clean_courses['level'].value_counts()

All Levels      5371
beginner        3208
intermediate    1443
Expert           263
                   1
Name: level, dtype: int64

In [61]:
df_clean_courses['schoolName'].value_counts()

Provided by Udemy Instructors          10008
School of Programming & Development      139
School of Artificial Intelligence         45
School of Data Science                    25
School of Business                        18
School of Product Management              15
Career Advancement                        11
School of Autonomous Systems              10
School of Cloud Computing                  9
School of Cybersecurity                    4
Name: schoolName, dtype: int64

In [66]:
df_clean_courses['duration'].value_counts()

Not Available    10008
4 Months            39
2 Months            39
3 Months            31
2 Weeks             28
3 Weeks             21
1 Week              18
1 Month             16
4 Weeks             12
16 Weeks             8
2 Days               7
6 Weeks              6
20 Hours             6
1 Day                5
5 Weeks              5
8 Weeks              5
                     4
6 Months             3
5 Months             3
7 Days               2
35 Hours             2
7 Weeks              2
21 Hours             2
10 Weeks             1
3 Days               1
28 Hours             1
160 Hours            1
6 Days               1
16 Hours             1
15 Hours             1
25 Hours             1
17 Hours             1
60 Hours             1
10 Hours             1
1 Hour               1
Name: duration, dtype: int64

##### Remove rows with missing (NA) values from the dataset

In [59]:
# Drop, in place, every row that has a missing (NaN/None) value in any column.
# Placeholder strings such as 'Not Available' or '' are not missing values to
# pandas, so rows containing them are kept.
df_clean_courses.dropna(inplace=True)

##### Encode categorical values in the dataset

In [64]:
# Fit a separate encoder for each categorical column and keep all of them, so
# every integer code can later be mapped back to its label with
# inverse_transform. A single encoder re-fitted per column only remembers the
# classes of the last column it was fitted on.
label_encoders = {}
for column in ('level', 'schoolName', 'category', 'source'):
    label_encoders[column] = LabelEncoder()
    df_clean_courses[column] = label_encoders[column].fit_transform(df_clean_courses[column])

# Backward compatibility: `labelencoder` previously ended up fitted on 'source'.
labelencoder = label_encoders['source']

In [65]:
df_clean_courses

Unnamed: 0,title,url,description,syllabus,skills,ratings_count,prerequisites,duration,category,level,schoolName,instructors,enrolled_students_count,avg_rating,num_reviews,price,source
0,Data Architect,https://www.udacity.com/course/data-architect-...,"Learn how to plan, design and implement enterp...",Not Avilable,"Entity relationship diagrams, Relational Data ...",68,"Intermediate Python, SQL, and Basics of ETL/Da...",4 Months,1,2,7,"[Ben Larson, Shankar Korrapolu, Shrinath Parik...",0,4.400000,68,21696,0
1,Natural Language Processing,https://www.udacity.com/course/natural-languag...,Learn the skills to get computers to understan...,Not Avilable,"Machine Learning,Speech Recognition,Sentiment ...",470,"Python, Statistics, Machine Learning, & Deep L...",3 Months,1,2,2,"[Luis Serrano, Jay Alammar, Arpan Chakraborty,...",0,4.520000,470,16272,0
2,AI Product Manager,https://www.udacity.com/course/ai-product-mana...,Learn to develop AI products that deliver busi...,Not Avilable,"AI Products,Training ML Models,Annotating Data...",513,No Experience Required,2 Months,1,3,8,"[Alyssa Simpson-Rochwerger, Meeta Dash, Kiran ...",0,4.340000,513,10848,0
3,Intro to Self-Driving Cars,https://www.udacity.com/course/intro-to-self-d...,This introductory program is the perfect way t...,Not Avilable,"Computer Vision,Machine Learning,Vehicle Motio...",743,Programming & Mathematics,4 Months,1,4,3,"[Sebastian Thrun, Andy Brown, Cezanne Camacho,...",0,4.650000,743,21696,0
4,Introduction to Cybersecurity,https://www.udacity.com/course/intro-to-cybers...,Get your start in the high growth field of cyb...,Not Avilable,"Threat Assessment,Security Vulnerabilities,Com...",96,Basic knowledge of network connectivity and OS...,4 Months,1,3,6,"[Ron Woerner, CISSP, CISM, Jerry Smith, Christ...",0,4.490000,96,21696,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10283,Complete RPG Maker MZ: Create and Publish for ...,https://www.udemy.com/course/complete-rpg-make...,e teach you how to use the power of RPG Maker\...,[Learn everything about RPG Maker MZ from begi...,Complete RPG Maker MZ: Create and Publish for ...,11,[RPG Maker MZ software that costs $80 on Steam...,Not Available,0,3,1,[Andres Cavallin],105,4.681818,11,959.84,1
10284,Create Your Empowering Feng Shui Vision Board,https://www.udemy.com/course/feng-shui-vision-...,Are you searching for purpose in life? Are you...,[Basics of Feng Shui: Understanding it's meani...,Create Your Empowering Feng Shui Vision Board,1,[No experience needed. Interest in spiritualit...,Not Available,0,1,1,"[Patricia ""Trish"" John]",8,4.000000,1,879.84,1
10285,CCNP ENTEPRRISE - ENARSI-300-410,https://www.udemy.com/course/ccnp-enarsi/,0\ngives you the knowledge you need to install...,"[Implement, Optimizing ,Troubleshooting EIGRP,...",CCNP ENTEPRRISE - ENARSI-300-410,47,[General understanding of network fundamentals...,Not Available,0,4,1,[sikandar Shaik],284,4.388889,47,479.84,1
10286,CSS3 Transition and Animation,https://www.udemy.com/course/css3-transition-a...,Keyframes\n\n * Animation\n\n * Animation I...,"[Keyframes, Animation, Animation Iteration, An...",CSS3 Transition and Animation,0,[This is a beginner to advance course — You do...,Not Available,0,1,1,[MD TAFSIRUL HAQUE DANISH],0,0.000000,0,319.84,1


In [83]:

# Hold out 30% of the rows for validation; the seed is fixed for repeatability.
train_df, val_df = train_test_split(df_clean_courses, test_size=0.3, random_state=42)

# The same predictor columns are used for both splits.
# NOTE(review): ratings_count and num_reviews are both filled from the raw
# num_reviews field when the cleaned collection is built, so they look
# redundant as features - confirm whether both are wanted.
feature_columns = [
    'ratings_count',
    'category',
    'level',
    'schoolName',
    'enrolled_students_count',
    'avg_rating',
    'num_reviews',
    'source',
]
target_column = 'price'

X_train, y_train = train_df[feature_columns], train_df[target_column]
X_val, y_val = val_df[feature_columns], val_df[target_column]

In [84]:
# Fit a random forest with trees limited to depth 2 and a fixed seed on the
# training split; fit() returns the fitted estimator itself.
regressor = RandomForestRegressor(max_depth=2, random_state=0).fit(X_train, y_train)

In [85]:
# Predictions on the training split (used for the training R^2 score).
y_pred1=regressor.predict(X_train)

In [86]:
r2_score(y_train, y_pred1)

0.7970385095890317

In [87]:
# Predictions on the held-out validation split (used for the validation R^2 score).
y_pred2=regressor.predict(X_val)

In [88]:
y_pred2

array([1025.12266898,  757.04136668, 1015.84489947, ...,  780.62856816,
       1025.12266898,  872.65870201])

In [89]:
r2_score(y_val, y_pred2)

0.8253549052052905