In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from pandas.plotting import lag_plot
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa import stattools
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import random
import re
import copy

# Relax pandas' display limits so long text columns and wide frames are not truncated.
for option_name, option_value in (
    ('display.max_colwidth', None),
    ('display.max_columns', None),
    ('display.max_rows', 2000),
):
    pd.set_option(option_name, option_value)

# Input data files are available in the read-only "../input/" directory.
# Walking it prints the full path of every file attached to this session.

import os
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)


# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
 

Tech Section

In [None]:
# Load the tech courses; a missing Summary becomes '' so it can be concatenated with Title.
df_tech = (
    pd.read_csv('/kaggle/input/2020-udemy-courses-dataset/udemy_tech.csv')
    .assign(Summary=lambda frame: frame['Summary'].fillna(''))
)
print(df_tech.shape)
df_tech.info()


In [None]:
# Peek at one randomly chosen tech course.
df_tech.sample(n=1)


NLP: group courses into topics

In [None]:
def get_topic_list(_df, n_components, stop_word_list):
    """Fit an LDA topic model on course text and return the top words of each topic.

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame with 'Title' and 'Summary' text columns. It is not modified.
    n_components : int
        Number of LDA topics to fit.
    stop_word_list : iterable of str
        Words to drop from every topic's top-word list (on top of the English
        stop words already removed by the vectorizer).

    Returns
    -------
    list of list of str
        One list per topic holding its (up to) 5 highest-weighted words, in
        ascending order of weight, with the stop words removed. Topics that
        have no word left after filtering are skipped.
    """
    # Join with a space so the last title word and the first summary word are
    # not fused into a single token; missing text is treated as empty.
    text = _df['Title'].fillna('') + ' ' + _df['Summary'].fillna('')

    cv = CountVectorizer(max_df=0.6, min_df=3, stop_words='english')
    dtm = cv.fit_transform(text)

    LDA = LatentDirichletAllocation(n_components=n_components, random_state=42)
    LDA.fit(dtm)

    # get_feature_names() was removed in scikit-learn 1.2; only fall back to it
    # on versions that predate get_feature_names_out(). Fetch the vocabulary
    # once instead of once per topic.
    if hasattr(cv, 'get_feature_names_out'):
        feature_names = cv.get_feature_names_out()
    else:
        feature_names = cv.get_feature_names()
    print(len(feature_names), len(LDA.components_))

    excluded_words = set(stop_word_list or ())
    all_topic = []
    for topic in LDA.components_:
        # argsort is ascending, so the last 5 indices are the heaviest words.
        top_words = [str(feature_names[i]) for i in topic.argsort()[-5:]]
        filtered_words = [word for word in top_words if word not in excluded_words]
        print('topic found: ', filtered_words)
        if filtered_words:
            all_topic.append(filtered_words)

    return all_topic


def get_topic_index(row, all_topic):
    """Return the index of the topic whose words occur most often in ``row``.

    Parameters
    ----------
    row : str
        Text to classify (this notebook passes Title and Summary joined together).
    all_topic : list of list of str
        One word list per topic.

    Returns
    -------
    int
        Index into ``all_topic`` of the word list with the most whole-word,
        case-insensitive occurrences in ``row`` (an optional trailing "s" is
        accepted). Ties, including "no match at all", go to the lowest index.

    Raises
    ------
    ValueError
        If ``all_topic`` is empty.
    """
    if not all_topic:
        raise ValueError('all_topic must contain at least one word list')

    wordlist_count = []
    for wordlist in all_topic:
        if not wordlist:
            # An empty alternation would match between any punctuation; count it as zero.
            wordlist_count.append(0)
            continue
        alternatives = '|'.join(re.escape(word) for word in wordlist)
        # Lookarounds assert word boundaries without consuming the separator, so
        # words at the very start/end of the text and adjacent repeats all count.
        pattern = r'(?<!\w)(?:' + alternatives + r')s?(?!\w)'
        wordlist_count.append(len(re.findall(pattern, row, re.I)))

    return wordlist_count.index(max(wordlist_count))


In [None]:
# Generic course-marketing words that carry no topical meaning for tech courses.
tech_stop_word_list = [
    'course', 'know', 'visual', 'services', 'start',
    'project', 'basics', 'beginners', 'scratch', 'application',
    'building', 'projects', 'world', 'real', 'learn',
    'apps', 'app', 'getting', 'driven', 'development',
    'started', 'need', 'code', 'coding', 'zero',
    'using', 'time', 'step', 'real', 'learn',
    'programming', 'easy', 'build', 'world', 'tool',
]
all_tech_topic = get_topic_list(df_tech, 20, tech_stop_word_list)


In [None]:
# Display the word list kept for each tech topic (as returned by get_topic_list).
all_tech_topic  

By Topic Group: number of courses per topic group

In [None]:
# Assign every tech course to the topic whose words appear most often in its text.
# Title and Summary are joined with a space so the last title word and the first
# summary word are not fused into one token that no topic word can match.
df_tech['topic_group_index'] = (df_tech['Title'] + ' ' + df_tech['Summary']).apply(get_topic_index, all_topic=all_tech_topic)
# Label each course with its topic's words, e.g. "word1, word2, word3".
df_tech['topic_group'] = df_tech['topic_group_index'].map(lambda x: ', '.join(all_tech_topic[x]))

# value_counts() already returns the counts in descending order.
df_sum_tech_topic_group = df_tech['topic_group'].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
df_sum_tech_topic_group.plot.barh(ax=ax)
ax.invert_yaxis()  # largest group at the top
ax.set(title='Tech: number of courses per topic group', xlabel='Number of courses', ylabel='Topic group');
    

By Topic Group: sum of enrollment per topic group

In [None]:
# Total enrollment per tech topic group, largest first.
df_tech_sum_enrollment = (
    df_tech.groupby('topic_group')['Enrollment']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 10))
df_tech_sum_enrollment.plot.barh(ax=ax)
ax.invert_yaxis()  # put the first (largest) bar at the top
    

In [None]:
# jointplot is a figure-level function: it creates its own figure and ignores `ax`,
# so the size is set through `height` rather than by opening a separate (blank) figure.
tech_rating_subset = df_tech.loc[df_tech['Rating'] < 100000]
sns.jointplot(
    x='Rating',
    y='Enrollment',
    # Cap the sample at the number of available rows and seed it so the plot is reproducible.
    data=tech_rating_subset.sample(n=min(5000, len(tech_rating_subset)), random_state=42),
    kind='reg',
    height=10,
);


In [None]:
# jointplot is a figure-level function: it creates its own (square) figure and ignores
# `ax`, so the size is set through `height` rather than by opening a blank figure.
sns.jointplot(
    x='Stars',
    y='Enrollment',
    data=df_tech.loc[df_tech['Enrollment'] < 100000],
    kind='reg',
    height=10,
);


In [None]:
# jointplot is a figure-level function: it creates its own (square) figure and ignores
# `ax`, so the size is set through `height` rather than by opening a blank figure.
sns.jointplot(x='Rating', y='Stars', data=df_tech, kind='reg', height=10);


In [None]:
# Ten tech courses with the highest Enrollment.
df_tech.loc[:, ['Title', 'Enrollment']].sort_values('Enrollment', ascending=False).iloc[:10]


In [None]:
# Ten tech courses with the highest Rating.
df_tech.loc[:, ['Title', 'Rating']].sort_values('Rating', ascending=False).iloc[:10]


In [None]:
# Ten tech courses with the highest Stars.
df_tech.loc[:, ['Title', 'Stars']].sort_values('Stars', ascending=False).iloc[:10]

Business Section

In [None]:
# Load the business courses; a missing Summary becomes '' so it can be concatenated with Title.
df_business = (
    pd.read_csv('/kaggle/input/2020-udemy-courses-dataset/udemy_business.csv')
    .assign(Summary=lambda frame: frame['Summary'].fillna(''))
)
print(df_business.shape)
df_business.info()


In [None]:
# Peek at one randomly chosen business course.
# (df_business.isnull().sum() would list the missing-value count per column.)
df_business.sample(n=1)

In [None]:
# Generic course-marketing words to drop from the business topic word lists.
business_stop_word_list = [
    'course', 'know', 'visual', 'services', 'start',
    'project', 'basics', 'beginners', 'scratch', 'application',
    'building', 'projects', 'world', 'real', 'learn',
    'apps', 'app', 'getting', 'driven', 'development',
    'started', 'need', 'code', 'coding', 'zero',
    'using', 'time', 'step', 'real', 'learn',
    'programming', 'easy', 'build', 'world', 'tool',
]
all_business_topic = get_topic_list(df_business, 20, business_stop_word_list)
    

In [None]:
# Assign every business course to the topic whose words appear most often in its text.
# Title and Summary are joined with a space so the last title word and the first
# summary word are not fused into one token that no topic word can match.
df_business['topic_group_index'] = (df_business['Title'] + ' ' + df_business['Summary']).apply(get_topic_index, all_topic=all_business_topic)
# Label each course with its topic's words, e.g. "word1, word2, word3".
df_business['topic_group'] = df_business['topic_group_index'].map(lambda x: ', '.join(all_business_topic[x]))

# value_counts() already returns the counts in descending order.
df_sum_business_topic_group = df_business['topic_group'].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
df_sum_business_topic_group.plot.barh(ax=ax)
ax.invert_yaxis()  # largest group at the top
ax.set(title='Business: number of courses per topic group', xlabel='Number of courses', ylabel='Topic group');


    

In [None]:
# Total enrollment per business topic group, largest first.
df_business_sum_enrollment = (
    df_business.groupby('topic_group')['Enrollment']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 10))
df_business_sum_enrollment.plot.barh(ax=ax)
ax.invert_yaxis()  # put the first (largest) bar at the top
    


Design Section

In [None]:
# Load the design courses; a missing Summary becomes '' so it can be concatenated with Title.
df_design = (
    pd.read_csv('/kaggle/input/2020-udemy-courses-dataset/udemy_design.csv')
    .assign(Summary=lambda frame: frame['Summary'].fillna(''))
)
print(df_design.shape)
df_design.info()


In [None]:
# Peek at one randomly chosen design course.
df_design.sample(n=1)
 

In [None]:
# Generic course-marketing words to drop from the design topic word lists.
design_stop_word_list = [
    'course', 'know', 'visual', 'services', 'start',
    'project', 'basics', 'beginners', 'scratch', 'application',
    'building', 'projects', 'world', 'real', 'learn',
    'apps', 'app', 'getting', 'driven', 'development',
    'started', 'need', 'code', 'coding', 'zero',
    'using', 'time', 'step', 'real', 'learn',
    'programming', 'easy', 'build', 'world', 'tool',
]
all_design_topic = get_topic_list(df_design, 20, design_stop_word_list)
   

In [None]:
# Assign every design course to the topic whose words appear most often in its text.
# Title and Summary are joined with a space so the last title word and the first
# summary word are not fused into one token that no topic word can match.
df_design['topic_group_index'] = (df_design['Title'] + ' ' + df_design['Summary']).apply(get_topic_index, all_topic=all_design_topic)
# Label each course with its topic's words, e.g. "word1, word2, word3".
df_design['topic_group'] = df_design['topic_group_index'].map(lambda x: ', '.join(all_design_topic[x]))

# value_counts() already returns the counts in descending order.
df_sum_design_topic_group = df_design['topic_group'].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
df_sum_design_topic_group.plot.barh(ax=ax)
ax.invert_yaxis()  # largest group at the top
ax.set(title='Design: number of courses per topic group', xlabel='Number of courses', ylabel='Topic group');


In [None]:
# Total enrollment per design topic group, largest first.
df_design_sum_enrollment = (
    df_design.groupby('topic_group')['Enrollment']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 10))
df_design_sum_enrollment.plot.barh(ax=ax)
ax.invert_yaxis()  # put the first (largest) bar at the top
    
    

Marketing Section

In [None]:
# Load the marketing courses; a missing Summary becomes '' so it can be concatenated with Title.
df_marketing = (
    pd.read_csv('/kaggle/input/2020-udemy-courses-dataset/udemy_marketing.csv')
    .assign(Summary=lambda frame: frame['Summary'].fillna(''))
)
print(df_marketing.shape)
df_marketing.info()


In [None]:
# Peek at one randomly chosen marketing course.
df_marketing.sample(n=1)


In [None]:
# Generic course-marketing words to drop from the marketing topic word lists.
marketing_stop_word_list = [
    'course', 'know', 'visual', 'services', 'start',
    'project', 'basics', 'beginners', 'scratch', 'application',
    'building', 'projects', 'world', 'real', 'learn',
    'apps', 'app', 'getting', 'driven', 'development',
    'started', 'need', 'code', 'coding', 'zero',
    'using', 'time', 'step', 'real', 'learn',
    'programming', 'easy', 'build', 'world', 'tool',
]
all_marketing_topic = get_topic_list(df_marketing, 20, marketing_stop_word_list)
  

In [None]:
# Assign every marketing course to the topic whose words appear most often in its text.
# Title and Summary are joined with a space so the last title word and the first
# summary word are not fused into one token that no topic word can match.
df_marketing['topic_group_index'] = (df_marketing['Title'] + ' ' + df_marketing['Summary']).apply(get_topic_index, all_topic=all_marketing_topic)
# Label each course with its topic's words, e.g. "word1, word2, word3".
df_marketing['topic_group'] = df_marketing['topic_group_index'].map(lambda x: ', '.join(all_marketing_topic[x]))

# value_counts() already returns the counts in descending order.
df_sum_marketing_topic_group = df_marketing['topic_group'].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
df_sum_marketing_topic_group.plot.barh(ax=ax)
ax.invert_yaxis()  # largest group at the top
ax.set(title='Marketing: number of courses per topic group', xlabel='Number of courses', ylabel='Topic group');


In [None]:
# Total enrollment per marketing topic group, largest first.
df_marketing_sum_enrollment = (
    df_marketing.groupby('topic_group')['Enrollment']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 10))
df_marketing_sum_enrollment.plot.barh(ax=ax)
ax.invert_yaxis()  # put the first (largest) bar at the top


Finance Section

In [None]:
# Load the finance courses; a missing Summary becomes '' so it can be concatenated with Title.
df_finance = (
    pd.read_csv('/kaggle/input/2020-udemy-courses-dataset/udemy_finance.csv')
    .assign(Summary=lambda frame: frame['Summary'].fillna(''))
)
print(df_finance.shape)
df_finance.info()


In [None]:
# Peek at one randomly chosen finance course.
df_finance.sample(n=1)


In [None]:
# Generic course-marketing words to drop from the finance topic word lists.
finance_stop_word_list = [
    'course', 'know', 'visual', 'services', 'start',
    'project', 'basics', 'beginners', 'scratch', 'application',
    'building', 'projects', 'world', 'real', 'learn',
    'apps', 'app', 'getting', 'driven', 'development',
    'started', 'need', 'code', 'coding', 'zero',
    'using', 'time', 'step', 'real', 'learn',
    'programming', 'easy', 'build', 'world', 'tool',
]
all_finance_topic = get_topic_list(df_finance, 20, finance_stop_word_list)
  

In [None]:
# Assign every finance course to the topic whose words appear most often in its text.
# Title and Summary are joined with a space so the last title word and the first
# summary word are not fused into one token that no topic word can match.
df_finance['topic_group_index'] = (df_finance['Title'] + ' ' + df_finance['Summary']).apply(get_topic_index, all_topic=all_finance_topic)
# Label each course with its topic's words, e.g. "word1, word2, word3".
df_finance['topic_group'] = df_finance['topic_group_index'].map(lambda x: ', '.join(all_finance_topic[x]))

# value_counts() already returns the counts in descending order.
df_sum_finance_topic_group = df_finance['topic_group'].value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
df_sum_finance_topic_group.plot.barh(ax=ax)
ax.invert_yaxis()  # largest group at the top
ax.set(title='Finance: number of courses per topic group', xlabel='Number of courses', ylabel='Topic group');


In [None]:
# Total enrollment per finance topic group, largest first.
df_finance_sum_enrollment = (
    df_finance.groupby('topic_group')['Enrollment']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 10))
df_finance_sum_enrollment.plot.barh(ax=ax)
ax.invert_yaxis()  # put the first (largest) bar at the top
