In [None]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd 
import seaborn as sns
import plotly.express as px 
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D


import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

In [None]:
# Load the raw Udemy courses table from the Kaggle dataset directory.
df = pd.read_csv(filepath_or_buffer='../input/udemy-courses/udemy_courses.csv')

In [None]:
# Remove the identifier and link columns, which are not used in the analysis.
# A non-mutating drop with errors='ignore' keeps this cell safe to re-run:
# the original in-place drop raised KeyError on a second execution because
# the columns were already gone.
df = df.drop(columns=['course_id', 'url'], errors='ignore')
df.head().transpose()

In [None]:
# Split the publication timestamp on '-' and keep the first two pieces
# (year and month) as integer columns.
timestamp_parts = df['published_timestamp'].str.split('-', expand=True)
new_df = timestamp_parts.loc[:, [0, 1]].astype('int')
new_df.columns = ['year', 'month']

# Attach the new columns to the course table and discard the raw timestamp.
data = pd.concat([df, new_df], axis=1).drop(columns=['published_timestamp'])
data.head()

As can be seen from the `count` row of the summary below, there are no null values in the data:

In [None]:
# Summarise every column, not only the numeric ones: the `count` row reports
# the number of non-null entries per column, so missing values in any column
# (including the text columns) would show up as a smaller count.
data.describe(include='all')

# Categorical data exploration

In [None]:
# Count courses per subject. The column names are set explicitly because the
# output of `value_counts().reset_index()` differs between pandas versions
# (['index', 'subject'] before 2.0, ['subject', 'count'] from 2.0 onwards).
temp_df = (
    data['subject']
    .value_counts()
    .rename_axis('subject')
    .reset_index(name='count')
)
fig = px.pie(
    temp_df,
    values='count',
    names='subject',
    title='Subject category distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

In [None]:
# Count courses per level. The column names are set explicitly because the
# output of `value_counts().reset_index()` differs between pandas versions
# (['index', 'level'] before 2.0, ['level', 'count'] from 2.0 onwards).
temp_df = (
    data['level']
    .value_counts()
    .rename_axis('level')
    .reset_index(name='count')
)
fig = px.pie(
    temp_df,
    values='count',
    names='level',
    title='Level category distribution',
    color_discrete_sequence=px.colors.sequential.RdBu,
)
fig.show()

As can be seen, courses in the 'Web Development' and 'Business Finance' categories prevail.
With respect to the 'level' category, 'All Levels' courses make up more than half of the dataset.

# Number of courses throughout the years

In [None]:
# Number of courses published per (year, subject) pair, in long format.
# The count gets its own column name so that the y-axis is not mislabelled
# as "year" (the original stored the counts in a column called 'year').
plot_data = (
    data.groupby(['year', 'subject'])
    .size()
    .reset_index(name='num_courses')
)

grid = sns.catplot(
    x='year',
    y='num_courses',
    hue='subject',
    palette=sns.color_palette("magma"),
    kind="point",
    data=plot_data,
)
grid.set_axis_labels('Publication year', 'Number of courses')
plt.show()

The number of 'Web Development' courses peaks at more than 400 in a single year, the highest value observed for any subject across all years!

All in all, the number of courses of every type reached its peak in 2016, but the 'Web Development' and 'Business Finance' categories clearly prevail in that period by a wide margin.

# Clustering

The common pattern and structure of the data can be observed through clustering and analysis of the resulting clusters. For more convenient visualization, three features were chosen: the number of lectures, subscribers and reviews.

In [None]:
# Cluster the courses on three numeric features and show the result in 3D.
chosen_features = ['num_lectures', 'num_subscribers', 'num_reviews']
N_CLUSTERS = 4
RANDOM_STATE = 42  # fixed seed so cluster labels are reproducible across runs

# Mean-normalise every feature: subtract the column mean, divide by the range.
features = data.loc[:, chosen_features].to_numpy(dtype=float)
x_ave = features.mean(axis=0)
x_rng = np.ptp(features, axis=0)
# A constant feature has zero range; divide by 1 instead to avoid NaN values.
x_rng = np.where(x_rng == 0, 1.0, x_rng)
x_rng_std = (features - x_ave) / x_rng

# n_init is given explicitly because its default changed between
# scikit-learn releases; 10 restarts matches the behaviour for init='random'.
kmeans = KMeans(
    n_clusters=N_CLUSTERS,
    init='random',
    n_init=10,
    algorithm='elkan',
    random_state=RANDOM_STATE,
).fit(x_rng_std)
data['labels'] = kmeans.labels_

# One 1-D coordinate array per feature; later cells reuse x, y and z.
x, y, z = x_rng_std.T

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c=kmeans.labels_, s=70)
ax.set_xlabel(chosen_features[0])
ax.set_ylabel(chosen_features[1])
ax.set_zlabel(chosen_features[2])
plt.title('Clusters of courses')
plt.show()

The same points, colored by subject:

In [None]:
# Same 3D scatter as the clustering plot, coloured by course subject instead
# of by cluster label.
# factorize assigns integer codes in order of first appearance, matching the
# original enumerate-over-unique mapping (assumes 'subject' has no missing
# values; confirm if the dataset changes).
subject_codes, subject_names = pd.factorize(data['subject'])

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(111, projection='3d')
points = ax.scatter(x, y, z, c=subject_codes, s=70)
ax.set_xlabel(chosen_features[0])
ax.set_ylabel(chosen_features[1])
ax.set_zlabel(chosen_features[2])

# One legend entry per subject so the colours can be read off the figure.
handles, _ = points.legend_elements()
ax.legend(handles, list(subject_names), title='subject', loc='upper left')
plt.title('Courses colored by subject')
plt.show()

Well, obviously, the clusters cannot be interpreted with respect to either the 'subject' or the 'level' category.

In [None]:
# Print the per-feature mean of every cluster, in ascending label order.
for cluster_id in sorted(data['labels'].unique()):
    in_cluster = data['labels'] == cluster_id
    members = data.loc[in_cluster, chosen_features].values
    center = np.mean(members, axis=0).round(3)
    # Labels are 0-based; report them 1-based.
    print(f'Centers of the cluster {cluster_id+1}')
    print(*chosen_features)
    print(*center, sep = '  ')
    print()

The clusters can be interpreted with respect to popularity, i.e. the number of subscribers and reviews. It may also be noticed that the most popular courses are not the most expensive ones.

# Word clouds of 'Web Development' and 'Musical Instruments' courses with respect to 'Level' category

In [None]:
# Distinct difficulty levels present in the data; one word cloud is drawn per level.
level_cats = set(data['level'].unique())
n_levels = len(level_cats)
print(f'{n_levels} word clouds for the following levels: {level_cats}')

## Web Development courses

In [None]:
# Draw one word cloud per difficulty level from 'Web Development' course titles.
for level in level_cats:
    selected = (data['level'] == level) & (data['subject'] == 'Web Development')
    titles = data.loc[selected, 'course_title'].dropna().astype(str)
    # Join with a space: joining with '' glued the last word of each title to
    # the first word of the next one and distorted the word frequencies.
    text = ' '.join(titles)
    if not text.strip():
        # WordCloud.generate raises on empty input, so skip empty selections.
        print(f'No Web Development course titles for level: {level}')
        continue
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000).generate(text)
    fig = plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    # Title each figure with its level; otherwise the clouds are indistinguishable.
    plt.title(f'Web Development: {level}')
    plt.axis("off")
    plt.show()
    plt.close(fig)

## Musical Instruments courses

In [None]:
# Draw one word cloud per difficulty level from 'Musical Instruments' course titles.
for level in level_cats:
    selected = (data['level'] == level) & (data['subject'] == 'Musical Instruments')
    titles = data.loc[selected, 'course_title'].dropna().astype(str)
    # Join with a space: joining with '' glued the last word of each title to
    # the first word of the next one and distorted the word frequencies.
    text = ' '.join(titles)
    if not text.strip():
        # WordCloud.generate raises on empty input, so skip empty selections.
        print(f'No Musical Instruments course titles for level: {level}')
        continue
    wordcloud = WordCloud(stopwords=STOPWORDS, background_color="white", max_words=1000).generate(text)
    fig = plt.figure(figsize=(8, 8))
    plt.imshow(wordcloud)
    # Title each figure with its level; otherwise the clouds are indistinguishable.
    plt.title(f'Musical Instruments: {level}')
    plt.axis("off")
    plt.show()
    plt.close(fig)