# EDA of Coursera dataset & similarity metrics on text data from scratch

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found;
# the sub-directory list returned by os.walk is not needed, hence the "_"
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [None]:
# importing the necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Reading the Coursera Dataset

In [None]:
# Load the Coursera course CSV into a DataFrame
# NOTE(review): the file name is spelled 'coursea_data.csv' -- presumably that is
# how the dataset file is actually named; confirm before "correcting" it
df=pd.read_csv('../input/coursera-course-dataset/coursea_data.csv')

In [None]:
# The Coursera dataset contains information about Coursera courses:
# course title, course organization, course rating, course certificate type,
# course difficulty and the number of students enrolled in each course

df.head()

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.columns

In [None]:
# Drop the 'Unnamed: 0' column (axis=1 means "columns") and rebind df to the result
# NOTE(review): presumably this column is a row index that was saved into the CSV -- confirm
df=df.drop(['Unnamed: 0'], axis=1)

In [None]:
df.head()

# Exploring the Dataset

In [None]:
# Check for missing data: evaluates to True if at least one cell anywhere
# in the DataFrame is null, False otherwise
df.isnull().values.any()

In [None]:
# finding number of unique courses
print("No. of unique courses:",df['course_title'].nunique())

In [None]:
# finding number of unique course organizations
print("No. of unique course organizations:",df['course_organization'].nunique())

In [None]:
# Count plot: one bar per course_difficulty category, bar height = number of courses

sns.countplot(x='course_difficulty',data=df)

# The title text ends with a newline, which leaves an empty line under the title
plt.title('Count plot to visualize count of each category of course difficulty\n')

In [None]:
# We can observe that the Beginner category contains the largest number of courses.
# A considerable number of courses fall in the Intermediate and Mixed categories,
# and the Advanced category contains the fewest courses.


In [None]:
# Count plot: one bar per course_Certificate_type category, bar height = number of courses

sns.countplot(x='course_Certificate_type',data=df)

# The title text ends with a newline, which leaves an empty line under the title
plt.title('Count plot to visualize count of each category of course certificate type\n')

In [None]:
# We can observe that most courses have the general course certificate type,
# almost 300 courses offer a specialization certificate,
# and very few courses grant a professional certificate.


In [None]:
# Count plot: one bar per distinct course_rating value, bar height = number of courses

sns.countplot(x='course_rating',data=df)

# The title text ends with a newline, which leaves an empty line under the title
plt.title('Count plot for count of courses based on the course rating attribute\n')

In [None]:
# We can observe that most courses have ratings in the range 4.6 to 4.8,
# with almost 250 courses having a rating of exactly 4.8.


In [None]:
# Display the 7 highest-rated courses and their details:
# sort by course_rating in descending order, then keep the first 7 rows

df.sort_values('course_rating',ascending=False).head(7)

In [None]:
def convert_num(stud):
    """Convert an abbreviated enrolment figure such as '5.3k' or '1.2m' to an int.

    Parameters
    ----------
    stud : str or number
        The figure to convert. A trailing 'k' (thousands) or 'm' (millions)
        suffix is honoured, in either case. A value with no suffix is taken
        as a plain number, so values that are already numeric pass through
        unchanged.

    Returns
    -------
    int
        The enrolment count as a whole number.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    """
    multipliers = {'k': 10**3, 'm': 10**6}

    # str() lets already-converted numbers through, so applying the function
    # to a column a second time is harmless.
    text = str(stud).strip().lower()

    # Slicing (rather than indexing) keeps an empty string from raising here;
    # it falls through to float(''), which reports a ValueError.
    suffix = text[-1:]

    if suffix in multipliers:
        value = float(text[:-1]) * multipliers[suffix]
    else:
        # No suffix: the text is the number itself. Slicing off a "suffix"
        # here would silently drop the last digit.
        value = float(text)

    # round() instead of int() so that a product landing a hair below a whole
    # number because of binary floating point is not truncated downwards.
    return round(value)
    
# Overwrite the course_students_enrolled column with the result of calling
# convert_num on each of its values
df['course_students_enrolled']=df['course_students_enrolled'].apply(convert_num)

In [None]:
df['course_students_enrolled'].head()

In [None]:
# Display the most popular courses, i.e. those with the most students enrolled:
# sort by course_students_enrolled in descending order and show the first rows
# (head() with no argument returns 5 rows)

df.sort_values('course_students_enrolled',ascending=False).head()

#  **Finding Similarity metrics on text data**

In [None]:
# Taking the course_title attribute and applying similarity measures to it to find similar courses,
# which can be used to recommend courses to a student, i.e. computing the distance between every pair of rows (objects)
df['course_title'].head(10)

In [None]:
# Work on a sample of n=100 titles: the slice keeps at most the first 100
# entries of course_title. The distance matrices further down are allocated
# as n x n with n=100 to match this sample.
s=df['course_title'][:100]


# **1. Edit Distance**

In [None]:
def edit_dist(s1,s2):
    """Return the edit (Levenshtein) distance between s1 and s2.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other.
    Only two rows of the dynamic-programming table are held at any time.
    """
    cols = len(s2)

    # Row for the empty prefix of s1: reaching s2[:j] from "" costs j insertions.
    prev = list(range(cols + 1))

    for row, ch1 in enumerate(s1, start=1):
        # First column: reaching "" from s1[:row] costs `row` deletions.
        cur = [row]
        for col, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                # Matching last characters cost nothing: carry the diagonal value.
                cur.append(prev[col - 1])
            else:
                # One edit on top of the cheapest of delete / insert / substitute.
                cur.append(1 + min(prev[col], cur[col - 1], prev[col - 1]))
        prev = cur

    return prev[cols]


In [None]:
# Initializing Edit distance matrix: an n x n array of integer zeros.
# n is expected to equal the number of titles being compared (100 here),
# since entry [i][j] will hold the distance between title i and title j.
n=100
dist_mat=np.zeros((n,n),dtype=int)
print("Shape of distance matrix:",dist_mat.shape)

In [None]:
def edit_dist_all(x):
    """Build the pairwise edit-distance matrix for a collection of strings.

    Parameters
    ----------
    x : iterable of str (for example a list or a pandas Series)
        The strings to compare, taken in iteration order.

    Returns
    -------
    numpy.ndarray
        Integer matrix of shape (len(x), len(x)); entry [i][j] is the
        edit_dist between the i-th and the j-th string.

    Notes
    -----
    The result is a freshly allocated array sized from the input; no
    module-level matrix is read or modified, so the caller must use the
    return value.
    """
    # Materialise once and address items by position. Indexing a pandas
    # Series with x[i] is a label lookup and breaks as soon as the index is
    # not exactly 0..n-1 (e.g. after filtering or sorting).
    titles = list(x)
    size = len(titles)

    result = np.zeros((size, size), dtype=int)

    for i in range(size):
        # Edit distance is symmetric, so each unordered pair is computed once
        # and mirrored across the diagonal.
        for j in range(i, size):
            d = edit_dist(titles[i], titles[j])
            result[i][j] = d
            result[j][i] = d

    return result

# Compute the edit distance between every pair of titles in s, keep the
# resulting matrix in dist_mat and print it
dist_mat=edit_dist_all(s)
print("Edit distance matrix:\n\n",dist_mat)

# **2. LCS (Longest Common Subsequence)**

In [None]:
def lcs(s1,s2):
    """Return the length of the longest common subsequence of s1 and s2.

    A subsequence keeps the original character order but need not be
    contiguous. The table is filled row by row, keeping only the previous
    row in memory.
    """
    # Row for the empty prefix of s1: nothing in common with any prefix of s2.
    above = [0] * (len(s2) + 1)

    for ch1 in s1:
        # Column 0 (empty prefix of s2) is always 0.
        current = [0]
        for col, ch2 in enumerate(s2, start=1):
            if ch1 == ch2:
                # Matching last characters extend the diagonal result by one.
                current.append(above[col - 1] + 1)
            else:
                # Otherwise take the better of dropping a character from either string.
                current.append(max(above[col], current[col - 1]))
        above = current

    return above[-1]

In [None]:
# Initializing LCS matrix: an n x n array of integer zeros.
# n is expected to equal the number of titles being compared (100 here),
# since entry [i][j] will hold the LCS length of title i and title j.
n=100
dist_lcs=np.zeros((n,n),dtype=int)
print("Shape of distance matrix:",dist_lcs.shape)

In [None]:
def lcs_dist_all(x):
    """Build the pairwise LCS-length matrix for a collection of strings.

    Parameters
    ----------
    x : iterable of str (for example a list or a pandas Series)
        The strings to compare, taken in iteration order.

    Returns
    -------
    numpy.ndarray
        Integer matrix of shape (len(x), len(x)); entry [i][j] is the
        lcs value of the i-th and the j-th string. Larger values mean the
        two strings have more in common.

    Notes
    -----
    The result is a freshly allocated array sized from the input; no
    module-level matrix is read or modified, so the caller must use the
    return value.
    """
    # Materialise once and address items by position. Indexing a pandas
    # Series with x[i] is a label lookup and breaks as soon as the index is
    # not exactly 0..n-1 (e.g. after filtering or sorting).
    titles = list(x)
    size = len(titles)

    result = np.zeros((size, size), dtype=int)

    for i in range(size):
        # LCS length does not depend on argument order, so each unordered
        # pair is computed once and mirrored across the diagonal.
        for j in range(i, size):
            common = lcs(titles[i], titles[j])
            result[i][j] = common
            result[j][i] = common

    return result

# Compute the LCS length for every pair of titles in s, keep the resulting
# matrix in dist_lcs and print it.
# Note: lcs returns the length of the common subsequence, so a LARGER value
# means the two titles are MORE alike, even though the label says "distance".
dist_lcs=lcs_dist_all(s)
print("LCS distance matrix:\n\n",dist_lcs)

# **3. Dice n-gram matching**

In [None]:
def dice_ngram(s1,s2):
    """Return the Dice coefficient of two strings over their character bigrams.

    The coefficient is 2 * |shared bigrams| / (|bigrams of s1| + |bigrams of s2|).
    Bigrams are treated as a multiset: a bigram occurring several times is
    shared only as often as it occurs in BOTH strings. This keeps the score
    symmetric in its arguments and inside the range [0, 1].

    Parameters
    ----------
    s1, s2 : str
        The strings to compare.

    Returns
    -------
    float
        1.0 when both strings have exactly the same bigrams, 0.0 when they
        share none. If neither string is long enough to contain a bigram
        (fewer than 2 characters each) the result is 1.0 when the strings
        are equal and 0.0 otherwise.
    """
    from collections import Counter

    # Count every pair of adjacent characters; a string shorter than 2
    # characters yields an empty counter.
    bigrams1 = Counter(s1[i:i + 2] for i in range(len(s1) - 1))
    bigrams2 = Counter(s2[i:i + 2] for i in range(len(s2) - 1))

    total = sum(bigrams1.values()) + sum(bigrams2.values())
    if total == 0:
        # No bigrams on either side: the formula would divide by zero, so fall
        # back to plain equality.
        return 1.0 if s1 == s2 else 0.0

    # Counter & Counter keeps, for every bigram, the smaller of the two counts,
    # i.e. the multiset intersection.
    shared = sum((bigrams1 & bigrams2).values())

    return 2 * shared / total



In [None]:
# Initializing Dice n-gram matrix: an n x n array of float zeros
# (np.zeros defaults to a float dtype, which the fractional scores need).
# n is expected to equal the number of titles being compared (100 here).
n=100
dist_ngram=np.zeros((n,n))
print("Shape of distance matrix:",dist_ngram.shape)

In [None]:
def ngram_dist_all(x):
    """Build the pairwise Dice bigram-coefficient matrix for a collection of strings.

    Parameters
    ----------
    x : iterable of str (for example a list or a pandas Series)
        The strings to compare, taken in iteration order.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape (len(x), len(x)); entry [i][j] is
        dice_ngram(i-th string, j-th string) rounded to 2 decimals.

    Notes
    -----
    The result is a freshly allocated array sized from the input; no
    module-level matrix is read or modified, so the caller must use the
    return value.
    """
    # Materialise once and address items by position. Indexing a pandas
    # Series with x[i] is a label lookup and breaks as soon as the index is
    # not exactly 0..n-1 (e.g. after filtering or sorting).
    titles = list(x)
    size = len(titles)

    result = np.zeros((size, size))

    for i, first in enumerate(titles):
        for j, second in enumerate(titles):
            result[i][j] = round(dice_ngram(first, second), 2)

    return result

# Compute the Dice bigram coefficient for every pair of titles in s, keep the
# resulting matrix in dist_ngram and print it.
# Note: the coefficient grows with the number of shared bigrams, so a LARGER
# value means the two titles are MORE alike, even though the label says "distance".
dist_ngram=ngram_dist_all(s)
print("Dice n-gram distance matrix:\n\n",dist_ngram)

# **4. Jaro distance**

In [None]:
import math
def jaro_distance(a1,a2):
    """Return the Jaro distance (1 - Jaro similarity) between two strings.

    Jaro similarity is (m/|a1| + m/|a2| + (m - t)/m) / 3, where m is the
    number of matching characters and t the number of transpositions.

    * Two equal characters "match" when their positions differ by at most
      max(|a1|, |a2|) // 2 - 1, and each character is matched at most once.
    * The matched characters of each string are read off in their own order;
      t is half the number of positions at which those two sequences differ.

    Parameters
    ----------
    a1, a2 : str
        The strings to compare.

    Returns
    -------
    float
        0.0 for identical strings, 1.0 when no characters match, otherwise
        1 - similarity with the similarity rounded to 2 decimals first (so
        the result has at most 2 decimals).
    """
    # If the strings are equal, the Jaro distance is zero.
    if a1 == a2:
        return 0.0

    s1 = len(a1)
    s2 = len(a2)

    # Furthest apart two equal characters may be and still count as a match.
    # Clamped at 0 so that very short strings do not produce a negative window.
    max_limit = max(0, max(s1, s2) // 2 - 1)

    matched_a1 = [False] * s1
    matched_a2 = [False] * s2
    m = 0

    # Greedy matching: each character of a1 takes the first still-unmatched
    # equal character of a2 inside its window.
    for i in range(s1):
        window_start = max(0, i - max_limit)
        window_end = min(s2, i + max_limit + 1)
        for j in range(window_start, window_end):
            if not matched_a2[j] and a1[i] == a2[j]:
                matched_a1[i] = True
                matched_a2[j] = True
                m += 1
                break

    if m == 0:
        # Nothing in common: similarity 0, distance 1 (also avoids dividing by m).
        return 1.0

    # Matched characters of each string, in that string's own order.
    l1 = [ch for ch, hit in zip(a1, matched_a1) if hit]
    l2 = [ch for ch, hit in zip(a2, matched_a2) if hit]

    # A position where the two sequences disagree is half a transposition.
    # Positions that already agree must not be counted, even if the same
    # character also occurs elsewhere in the other sequence.
    out_of_order = sum(c1 != c2 for c1, c2 in zip(l1, l2))
    transp = out_of_order // 2

    jaro_sim = (m / s1 + m / s2 + (m - transp) / m) / 3
    jaro_sim = round(jaro_sim, 2)

    # Round again: 1 - x on a 2-decimal float can leave binary noise behind.
    return round(1 - jaro_sim, 2)


In [None]:
# Initializing Jaro distance matrix: an n x n array of float zeros
# (np.zeros defaults to a float dtype, which the fractional distances need).
# n is expected to equal the number of titles being compared (100 here).
n=100
dist_jaro=np.zeros((n,n))
print("Shape of distance matrix:",dist_jaro.shape)

In [None]:
def jaro_dist_all(x):
    """Build the pairwise Jaro-distance matrix for a collection of strings.

    Parameters
    ----------
    x : iterable of str (for example a list or a pandas Series)
        The strings to compare, taken in iteration order.

    Returns
    -------
    numpy.ndarray
        Float matrix of shape (len(x), len(x)); entry [i][j] is
        jaro_distance(i-th string, j-th string) rounded to 2 decimals.

    Notes
    -----
    The result is a freshly allocated array sized from the input; no
    module-level matrix is read or modified, so the caller must use the
    return value.
    """
    # Materialise once and address items by position. Indexing a pandas
    # Series with x[i] is a label lookup and breaks as soon as the index is
    # not exactly 0..n-1 (e.g. after filtering or sorting).
    titles = list(x)
    size = len(titles)

    result = np.zeros((size, size))

    for i, first in enumerate(titles):
        for j, second in enumerate(titles):
            result[i][j] = round(jaro_distance(first, second), 2)

    return result

# Compute the Jaro distance between every pair of titles in s, keep the
# resulting matrix in dist_jaro and print it (identical titles give 0)
dist_jaro=jaro_dist_all(s)
print("Jaro distance matrix:\n\n",dist_jaro)