In [682]:
import pandas as pd

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.offline as plyo

In [None]:
# `nltk` must be imported in this cell: the notebook's NLTK import cell only comes
# later, so on a fresh kernel the bare call raised NameError.
import nltk

# 'punkt' backs word_tokenize; 'stopwords' and 'wordnet' back the stop-word filter
# and the WordNetLemmatizer used by preprocess_text.
for nltk_resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [634]:
# Reading all courses data
all_courses_df = pd.read_csv('./all_courses_data.csv')

### 1. Word Frequency: Given the text descriptions of the Web Development courses, we want to analyze the frequency of the most common words

In [None]:
import re

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [670]:
import bokeh.io
from bokeh.plotting import figure, show
from bokeh.models import Slider, ColumnDataSource, Dropdown, CustomJS, CDSView
from bokeh.layouts import Column

In [635]:
course_text_df = all_courses_df[['course_id', 'course_title','course_details']].copy()

In [636]:
course_text_df.head()

Unnamed: 0,course_id,course_title,course_details
0,ps_1,Web Development: Executive Briefing,Tech leaders need a fundamental understanding ...
1,ps_2,Front End Web Development: Get Started,Front end web development involves many differ...
2,ps_3,Beyond ASP.NET MVC: Modern Web Development Dem...,The web development landscape is constantly ch...
3,ps_4,Tactics and Tools for Troubleshooting Front-en...,At the core of any fully responsive website is...
4,ps_5,Developing Web Applications and Web APIs Prote...,A large percentage of applications are accesse...


In [637]:
course_text_df['combined_tags'] = course_text_df['course_title'] + " " + course_text_df['course_details']

In [638]:
# Keep only the courses whose title is made up solely of ASCII characters
# (rows with any non-ASCII character in the title are dropped).
ascii_title_mask = course_text_df['course_title'].map(str.isascii)
course_text_df = course_text_df[ascii_title_mask]

In [639]:
course_text_df.reset_index(drop=True, inplace=True) 

In [640]:
# Define text preprocessing function
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the shared (stemmer, lemmatizer, stop-word set), building them on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS['stemmer'] = PorterStemmer()
        _TEXT_TOOLS['lemmatizer'] = WordNetLemmatizer()
        _TEXT_TOOLS['stop_words'] = frozenset(stopwords.words('english'))
    return _TEXT_TOOLS['stemmer'], _TEXT_TOOLS['lemmatizer'], _TEXT_TOOLS['stop_words']


def preprocess_text(text):
    """Normalise one piece of text into a space-separated string of stemmed tokens.

    Steps: replace every non-letter with a space, lowercase, tokenize with NLTK,
    drop English stop words, then lemmatize and stem each remaining token.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN or None from a missing
        title/description) is treated as empty text.

    Returns
    -------
    str
        The processed tokens joined by single spaces; '' when there is nothing
        left to tokenize.
    """
    # Missing values reach this function as float NaN / None; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    # Remove non-alphabetic characters and lowercase the text
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    if not text.strip():
        return ''

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # The stemmer, lemmatizer and stop-word set are built once and reused for every row.
    stemmer, lemmatizer, stop_words = _get_text_tools()

    # Using both stemmer and lemmatizer as some words like explore won't work correctly
    # for lemmatizer but works well for stemmer and vice-versa
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens if word not in stop_words]

    # Join the tokens back into a single string
    return ' '.join(tokens)

In [641]:
# Apply text preprocessing to the 'combined_tags' column
course_text_df['combined_tags_preprocessed'] = course_text_df['combined_tags'].apply(preprocess_text)

In [642]:
# Flatten every preprocessed description into one list of lowercase word tokens.
tokens = []
for desc in course_text_df['combined_tags_preprocessed']:
    tokens.extend(word.lower() for word in word_tokenize(desc))


In [643]:
# Calculate word frequencies
freq_dist = FreqDist(tokens)

In [644]:
# One row per word with its count, most frequent first; the word becomes a regular column.
freq_df = (
    pd.DataFrame.from_dict(dict(freq_dist), orient='index', columns=['Frequency'])
    .rename_axis('Word')
    .sort_values(by='Frequency', ascending=False)
    .reset_index()
)

In [645]:
freq_dict = freq_df.to_dict(orient='list')

In [646]:
source = ColumnDataSource(freq_dict)

In [647]:
# Bar chart of word frequencies on a categorical x-axis. The axis starts with the
# first five words of freq_dict; the bars themselves come from `source`.
plot = figure(
    x_range=[],
    height=500,
    width=1000,
    title="Frequency Distribution of Words",
    x_axis_label="Words",
    y_axis_label="Frequency",
)
plot.x_range.factors = freq_dict['Word'][0:5]

# Axis titles in bold on both axes.
plot.xaxis.axis_label_text_font_style = 'bold'
plot.yaxis.axis_label_text_font_style = 'bold'

# Tilt the word labels and enlarge them so they stay readable.
plot.xaxis.major_label_text_font_size = "12pt"
plot.xaxis.major_label_orientation = -0.3 * 3.14

plot.vbar(x='Word', top='Frequency', width=0.8, source=source)

In [649]:
# Add sliders
# `slider` picks the index of the first word shown; `range_slider` picks how many
# words are shown (5 to 50).
# NOTE(review): `end` becomes <= 0 when there are 50 or fewer distinct words — confirm
# the vocabulary is always larger than that.
slider = Slider(start=0, end=len(freq_dict['Word'])-50, value=0,
                             step=20, title="Range starting at: ", width=800)
range_slider = Slider(start=5, end=50, value=5, step=1, title="Number of words: ")

# Update plot based on slider values
# The JS below slices the full word/frequency lists (passed in as `freq_data`) to the
# window chosen by the two sliders, then pushes that slice into both the x-axis
# factors and the data source.
callback = CustomJS(args={'figure': plot, 'source': source, 'slider': slider, 'range_slider': range_slider, 'freq_data': freq_dict},
                                 code="""
    const start_value = slider.value;
    const range_value = range_slider.value;
    
    const slicedData = { Word: freq_data['Word'].slice(start_value, start_value+range_value), Frequency: freq_data['Frequency'].slice(start_value, start_value+range_value) };
    figure.x_range.factors = freq_data['Word'].slice(start_value, start_value+range_value);

    source.data = slicedData
    source.change.emit();
""")

# Both sliders share the same callback, so moving either one redraws the window.
slider.js_on_change('value', callback)
range_slider.js_on_change('value', callback)

In [678]:
layout = Column(plot, slider, range_slider)

In [679]:
show(layout)

In [652]:
from bokeh.resources import CDN

# Write the word-frequency layout to its own HTML file. `save` only writes this one
# file, whereas `output_file` changes Bokeh's global output state: placed after
# `show(layout)` it saved nothing here and instead redirected the later `show`
# call (the pie chart) into 'word_count_bar_plot.html'.
bokeh.io.save(layout, filename='word_count_bar_plot.html', resources=CDN,
              title='Frequency Distribution of Words')

### RQ 2. Visualize the course level per site to see the distribution of course levels across all 3 platforms

In [415]:
from math import pi
from bokeh.transform import cumsum

In [687]:
course_level_info = all_courses_df[['course_id', 'course_level']].copy()

In [688]:
course_level_info['course_level_name'] = course_level_info['course_level'].map({0: 'All', 1: 'Beginner', 2: 'Intermediate', 3: 'Advanced'})

In [689]:
course_level_info['site_name'] = course_level_info['course_id'].str.split('_').str[0].map({'ud': 'Udemy', 'ce': 'Coursera', 'ps': 'Pluralsight'})

In [690]:
course_level_info = course_level_info.drop(columns=['course_id', 'course_level'])

In [691]:
course_level_count = course_level_info.groupby(['site_name', 'course_level_name']).size().reset_index(name='count')

In [692]:
course_level_count.loc[len(course_level_count)] = {'site_name': 'Pluralsight', 'course_level_name': 'All', 'count': 0}

In [693]:
course_level_count

Unnamed: 0,site_name,course_level_name,count
0,Coursera,Advanced,22
1,Coursera,All,98
2,Coursera,Beginner,291
3,Coursera,Intermediate,279
4,Pluralsight,Advanced,11
5,Pluralsight,Beginner,81
6,Pluralsight,Intermediate,99
7,Udemy,Advanced,54
8,Udemy,All,2873
9,Udemy,Beginner,1888


In [661]:
course_level_count['angle'] = course_level_count['count'] / course_level_count.groupby('site_name')['count'].transform('sum') * 2*pi

In [662]:
course_level_count

Unnamed: 0,site_name,course_level_name,count,angle
0,Coursera,Advanced,21,0.22555
1,Coursera,All,84,0.902201
2,Coursera,Beginner,250,2.685122
3,Coursera,Intermediate,230,2.470312
4,Pluralsight,Advanced,11,0.365688
5,Pluralsight,Beginner,80,2.659549
6,Pluralsight,Intermediate,98,3.257948
7,Udemy,Advanced,55,0.062053
8,Udemy,All,2830,3.192928
9,Udemy,Beginner,1860,2.098532


In [663]:
# Fixed fill colour per course level, attached to every row of the count table.
colors = dict(Advanced="tomato", All="orchid", Beginner='yellow', Intermediate="orange")
course_level_count['color'] = [colors[name] for name in course_level_count['course_level_name']]

In [664]:
source = ColumnDataSource(course_level_count)

In [671]:
p = figure(height=400, title="Course level pie Chart: Udemy", toolbar_location=None, tools="hover", tooltips="@course_level_name: @count", x_range=(-0.5, 1.0))

view = CDSView( filter=bokeh.models.GroupFilter(column_name='site_name', group='Udemy'))

In [672]:
# Draw one wedge per (site, level) row of `source`; `view` limits the glyph to the
# rows of the currently selected site.
# NOTE(review): cumsum presumably accumulates over the whole 'angle' column rather
# than only the rows in `view`. That would still line up because each site's angles
# sum to 2*pi, but it relies on the rows being grouped by site — confirm.
p.wedge(x=0, y=1, radius=0.4, 
            start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
            line_color="white", fill_color='color', legend_field='course_level_name', source=source, view=view)

In [673]:
menu = [("Udemy", "Udemy"), ("Coursera", "Coursera"), ("Pluralsight", "Pluralsight")]

In [674]:
dropdown = Dropdown(label="Udemy", button_type="primary", menu=menu)

In [675]:
# Dropdown handler: show the wedges of the chosen platform and retitle the chart.
# The JS previously read `view.filters[0]`, but the view is created with
# `CDSView(filter=...)`, so the GroupFilter is looked up on `view.filter` first.
# Leftover console.log debugging was removed.
callback = CustomJS(args = {"source": source, "view": view, "figure": p, 'dropdown': dropdown}, code="""
    // cb_obj is the menu_item_click event; `item` is the value of the chosen entry.
    const site = cb_obj.item;
    dropdown.label = site;

    // Single `filter` on current views; fall back to the older `filters` list.
    const group_filter = view.filter !== undefined ? view.filter : view.filters[0];
    group_filter.group = site;

    figure.title.text = 'Course level pie Chart: ' + site;
    source.change.emit();
""")
dropdown.js_on_event('menu_item_click', callback)

In [680]:
p.axis.axis_label = None
p.axis.visible = False
p.grid.grid_line_color = None

layout = Column(dropdown,p)
show(layout)

#### RQ 3. Visualize scatter plots to compare popularity in terms of how many people rated a course and how high the ratings are
- Scatter plot between course rating and number of reviews

In [694]:
rating_df  = all_courses_df[['course_id', 'course_rating', 'course_no_of_reviews']].copy()

In [695]:
rating_df['Platform'] = rating_df['course_id'].str.split('_').str[0].map({'ud': 'Udemy', 'ce': 'Coursera', 'ps': 'Pluralsight'})

In [697]:
len(rating_df)

6485

In [698]:
rating_df = rating_df.dropna(subset=['course_rating'])

In [699]:
rating_df = rating_df.dropna(subset=['course_no_of_reviews'])

In [700]:
# Keep only courses that have both a rating and a review count, with neither equal to zero.
rating_df = rating_df.dropna(subset=['course_rating', 'course_no_of_reviews'])
nonzero_rating = rating_df['course_rating'] != 0
nonzero_reviews = rating_df['course_no_of_reviews'] != 0
rating_df = rating_df[nonzero_rating & nonzero_reviews]

In [702]:
len(rating_df)

5980

In [703]:
rating_df = rating_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [704]:
# Scatter plot between course rating and number of reviews by site.
# The colour column is 'Platform', so that is the key to relabel; the previous
# 'site' key matched no column and was silently ignored.
fig = px.scatter(rating_df, x='course_rating', y='course_no_of_reviews', color='Platform', 
                 title='Course Rating vs Number of Reviews by Site',
                 labels={'course_rating': 'Course Rating', 'course_no_of_reviews': 'Number of Reviews', 'Platform': 'Site'})
fig.show()

In [685]:
plyo.plot(fig, filename='rating_v_reviews_plot.html')

'rating_v_reviews_plot.html'

In [625]:
# Per-platform descriptive statistics of the course ratings.
rating_stat_names = ['mean', 'median', 'std', 'min', 'max']
grouped_data = rating_df.groupby('Platform')
summary_stats = grouped_data['course_rating'].agg(rating_stat_names)

# Heading and table are emitted on consecutive lines.
print("Summary Statistics for course per Site:", summary_stats, sep="\n")

Summary Statistics for course per Site:
                 mean    median       std  min  max
Platform                                           
Coursera     4.539254  4.600000  0.315418  2.8  5.0
Pluralsight  4.445578  4.500000  0.392783  2.7  5.0
Udemy        4.228626  4.325498  0.511352  1.0  5.0


In [630]:
def calculate_metrics(data):
    """Summarise how ratings split around 4 and how many reviews each side receives.

    "Below 4" means ``course_rating < 4``; "above 4" means ``course_rating >= 4``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'course_rating' and 'course_no_of_reviews'.

    Returns
    -------
    tuple
        ``(fraction_below_4, fraction_above_4, mean_reviews_below_4,
        median_reviews_below_4, mean_reviews_above_4, median_reviews_above_4)``.
        Fractions are relative to the number of rows in ``data`` and are NaN when
        ``data`` has no rows (instead of raising ZeroDivisionError). The review
        statistics are NaN for a side that has no courses.
    """
    total_courses = len(data)

    # Select each side once and reuse it for the count, the mean and the median.
    reviews_below_4 = data.loc[data['course_rating'] < 4, 'course_no_of_reviews']
    reviews_above_4 = data.loc[data['course_rating'] >= 4, 'course_no_of_reviews']

    if total_courses:
        fraction_below_4 = len(reviews_below_4) / total_courses
        fraction_above_4 = len(reviews_above_4) / total_courses
    else:
        # An empty platform slice has no meaningful fractions.
        fraction_below_4 = fraction_above_4 = float('nan')

    return (
        fraction_below_4,
        fraction_above_4,
        reviews_below_4.mean(),
        reviews_below_4.median(),
        reviews_above_4.mean(),
        reviews_above_4.median(),
    )
    

In [631]:
# Print the rating split and review statistics for each platform, where a
# platform is selected by the prefix of its course_id.
platforms = ['ud', 'ce', 'ps']
for platform in platforms:
    # Rows belonging to the current platform
    id_matches = rating_df['course_id'].str.startswith(platform)
    platform_data = rating_df[id_matches]

    (fraction_below_4, fraction_above_4,
     mean_reviews_below_4, median_reviews_below_4,
     mean_reviews_above_4, median_reviews_above_4) = calculate_metrics(platform_data)

    # One block of text per platform, terminated by an empty line.
    report_lines = [
        f'Platform: {platform.upper()}',
        f'Fraction of courses with rating below 4: {fraction_below_4:.2f}',
        f'Fraction of courses with rating above 4: {fraction_above_4:.2f}',
        f'Mean number of reviews for courses with rating below 4: {mean_reviews_below_4:.2f}',
        f'Median number of reviews for courses with rating below 4: {median_reviews_below_4:.2f}',
        f'Mean number of reviews for courses with rating above 4: {mean_reviews_above_4:.2f}',
        f'Median number of reviews for courses with rating above 4: {median_reviews_above_4:.2f}',
        '',
    ]
    print('\n'.join(report_lines))

Platform: UD
Fraction of courses with rating below 4: 0.26
Fraction of courses with rating above 4: 0.74
Mean number of reviews for courses with rating below 4: 149.39
Median number of reviews for courses with rating below 4: 43.00
Mean number of reviews for courses with rating above 4: 1153.12
Median number of reviews for courses with rating above 4: 73.00

Platform: CE
Fraction of courses with rating below 4: 0.06
Fraction of courses with rating above 4: 0.94
Mean number of reviews for courses with rating below 4: 114.00
Median number of reviews for courses with rating below 4: 26.50
Mean number of reviews for courses with rating above 4: 1246.23
Median number of reviews for courses with rating above 4: 175.00

Platform: PS
Fraction of courses with rating below 4: 0.10
Fraction of courses with rating above 4: 0.90
Mean number of reviews for courses with rating below 4: 54.33
Median number of reviews for courses with rating below 4: 36.00
Mean number of reviews for courses with rating

#### Judging by the mean, median, std, min and max of the ratings, Coursera has the highest ratings, followed by Pluralsight and finally Udemy. Judging by the scatter plot, however, Udemy is the best platform in terms of number of reviews, followed by Coursera and finally Pluralsight.

#### RQ 4. Visualize the histogram of course duration against the number of courses
- Box plot and histogram to see the distribution

In [705]:
course_duration_df = all_courses_df[['course_id', 'course_duration']].copy()

In [706]:
course_duration_df['Platform'] = course_duration_df['course_id'].str.split('_').str[0].map({'ud': 'Udemy', 'ce': 'Coursera', 'ps': 'Pluralsight'})

In [707]:
# Drop courses without a duration from the frame this section actually plots.
# The original line operated on `df`, which is not defined until the RQ 5 section.
course_duration_df = course_duration_df.dropna(subset=['course_duration'])

In [710]:
def _duration_histogram(platform_df, chart_title, bar_colors=None):
    """Build the course-duration histogram for one platform's rows."""
    return px.histogram(
        platform_df,
        x='course_duration',
        title=chart_title,
        labels={'course_duration': 'Course Duration (Hours)', 'count': 'Frequency'},
        color_discrete_sequence=bar_colors,
    )


# Udemy (default colour)
udemy_df = course_duration_df[course_duration_df['Platform'] == 'Udemy']
fig_udemy = _duration_histogram(udemy_df, 'Udemy: Distribution of Course Durations')
fig_udemy.show()

# Coursera (orange bars)
coursera_df = course_duration_df[course_duration_df['Platform'] == 'Coursera']
fig_coursera = _duration_histogram(coursera_df, 'Coursera: Distribution of Course Durations', ['orange'])
fig_coursera.show()

# Pluralsight (green bars)
pluralsight_df = course_duration_df[course_duration_df['Platform'] == 'Pluralsight']
fig_pluralsight = _duration_histogram(pluralsight_df, 'Pluralsight: Distribution of Course Durations', ['green'])
fig_pluralsight.show()


In [711]:
# Box plot of course durations for all three sites.
# The plotted column is course_duration, so the title and labels describe durations;
# the label keys are the columns actually used ('Platform', 'course_duration').
fig = px.box(course_duration_df, x='Platform', y='course_duration',
             title='Distribution of Course Durations by Site',
             labels={'Platform': 'Site', 'course_duration': 'Course Duration (Hours)'})
fig.show()

In [712]:
# Per-platform descriptive statistics of the course durations.
duration_stat_names = ['mean', 'median', 'std', 'min', 'max']
grouped_data = course_duration_df.groupby('Platform')
summary_stats = grouped_data['course_duration'].agg(duration_stat_names)

# Heading and table are emitted on consecutive lines.
print("Summary Statistics for course per Site:", summary_stats, sep="\n")

Summary Statistics for course per Site:
                  mean     median        std       min         max
Platform                                                          
Coursera     16.588406  14.000000  12.958296  0.000000  133.000000
Pluralsight   2.272875   1.868889   1.454403  0.090000    9.105278
Udemy         8.145170   4.500000  11.378831  0.033333  141.000000


#### RQ 5. Bubble chart showing the relationship between course rating, duration, and number of enrollments 

In [714]:
df = all_courses_df[['course_id', 'course_title', 'course_duration', 'course_rating', 'course_no_of_reviews', 'course_no_of_enrolled']].copy()

In [715]:
df['Platform'] = df['course_id'].str.split('_').str[0].map({'ud': 'Udemy', 'ce': 'Coursera', 'ps': 'Pluralsight'})

In [716]:
df = df.dropna(subset=['course_no_of_reviews', 'course_no_of_enrolled', 'course_duration'])

In [717]:

def _rating_duration_bubbles(platform_df, chart_title):
    """Bubble chart of rating vs duration: bubble size = reviews, colour = enrollments."""
    axis_labels = {
        'course_rating': 'Course Rating',
        'course_duration': 'Course Duration',
        'course_no_of_reviews': 'Number of Reviews',
        'course_no_of_enrolled': 'Number of Enrollments',
    }
    return px.scatter(
        platform_df,
        x='course_rating',
        y='course_duration',
        size='course_no_of_reviews',
        color='course_no_of_enrolled',
        title=chart_title,
        labels=axis_labels,
        hover_name='course_title',
        size_max=50,
    )


# Bubble chart for Udemy
udemy_df = df[df['Platform'] == 'Udemy']
fig = _rating_duration_bubbles(udemy_df, 'Udemy: Relationship between Course Rating, Duration, and Enrollments')
fig.show()

# Bubble chart for Coursera
coursera_df = df[df['Platform'] == 'Coursera']
fig = _rating_duration_bubbles(coursera_df, 'Coursera: Relationship between Course Rating, Duration, and Enrollments')
fig.show()
