In [None]:
import pandas as pd
import spacy
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from spacy.lang.en.stop_words import STOP_WORDS
# Load the spaCy `en_core_web_md` pipeline into the notebook-level `nlp` object.
nlp = spacy.load("en_core_web_md")

# Put '../src' at the front of the module search path so that the
# `text_operations` module (presumably located there) can be imported.
import sys
sys.path.insert(0, '../src')
from text_operations import TextPreprocessor

In [None]:
# Read the labelled programme catalogue.
raw_programs = pd.read_csv("../../data_collection/data/labelled/masters_data_programs_india_usa.csv")

# Human-readable identifier: "<university> - <programme>".
pgm_id = raw_programs['uni_name'] + " - " + raw_programs['pgm_name']
descr = raw_programs['descr']


def _preprocess_description(raw_text):
    """Run TextPreprocessor on a description, passing the 'Not inferred' placeholder through unchanged."""
    if raw_text == 'Not inferred':
        return raw_text
    return TextPreprocessor(raw_text).preprocess_text()


text = descr.apply(_preprocess_description)

# Keep only the identifier, the raw description and its preprocessed form.
df = pd.DataFrame({'id': pgm_id, 'descr': descr, 'text': text})
df.head()

Unnamed: 0,id,descr,text
0,Indian Institute of Science Bangalore - M.Tech...,Computational Science and Data Science are int...,computational science data science interdiscip...
1,Indian Institute of Science Bangalore - M.Tech...,Artificial Intelligence has captured the ima...,artificial intelligence captured imagination e...
2,University of Hyderabad - MBA Business Analytics,The School has launched a two year MBA in Bus...,school launched year mba business analytics pr...
3,University of Hyderabad - M.Tech. Artificial I...,M.Tech Artificial Intelligence is also a four...,m tech artificial intelligence four semester c...
4,IIT Guwahati - M.Tech Data Science,Data Science is gaining prominence in academi...,data science gaining prominence academia indus...


In [None]:
# Reload the full labelled table; this replaces whatever `df` held before this cell.
df = pd.read_csv("../../data_collection/data/labelled/masters_data_programs_india_usa.csv")

# Raw (un-preprocessed) descriptions as a plain Python list.
texts = df["descr"].tolist()
texts[0]

'Computational Science and Data Science are interdisciplinary areas that bring together the domain specific knowledge of science and engineering with relevant areas of computing systems and formal foundations. While computational science investigates scientific computing applications that require mathematical techniques and parallel computing, data science explores data-intensive applications that use scalable statistical and machine learning methods with Big Data and Cloud platforms. These impart foundational and scalable systems skills for computational and data sciences, with advanced courses selected by students to allow specialization on methods, platforms, and applications.'

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

2021-07-24 04:00:12.908380: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-07-24 04:00:12.908438: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [None]:
# Build a TF-IDF representation of the raw programme descriptions.
# The spaCy stop words are a set; they are passed as a (sorted) list because
# newer scikit-learn releases validate `stop_words` and only accept
# 'english', a list, or None.
# NOTE(review): scikit-learn may still warn that some spaCy stop words are
# inconsistent with its own tokenisation; that warning is informational.
vectorizer = TfidfVectorizer(
    stop_words=sorted(STOP_WORDS)
)

X = vectorizer.fit_transform(texts)

# `precompute_distances` and `n_jobs` were deprecated in scikit-learn 0.23 and
# removed from KMeans in 1.0, so they are no longer passed. They only affected
# how the computation was carried out, not the clustering for a given seed.
model = KMeans(n_clusters=15, init='k-means++',
               random_state=2019,
               max_iter=100, n_init=16)

model.fit(X)

  'stop_words.' % sorted(inconsistent))


KMeans(max_iter=100, n_clusters=15, n_init=16, n_jobs=16,
       precompute_distances=True, random_state=2019)

In [None]:
# Attach the K-Means cluster id of every description to the table.
cluster_labels = model.predict(X)
df["cluster"] = cluster_labels

In [None]:
# Number of programmes per cluster, largest cluster first.
df["cluster"].value_counts()

1     25
14    18
13    16
7     15
3      9
4      6
12     5
2      3
9      3
0      2
5      2
8      2
10     2
11     2
6      1
Name: cluster, dtype: int64

In [None]:
# Show three example descriptions from cluster 4. The seed is fixed (same value
# as the models in this notebook) so the same examples appear on every run.
df[df.cluster == 4].sample(3, random_state=2019).descr.values.tolist()

["  Artificial Intelligence has captured the imagination of the entire world with its potential ability to solve complex societal problems of our times: universal access to healthcare and education, efficient transportation, increased efficiency in providing e-governance services to the public, etc. Focused national efforts are underway in many countries. The Government of India has initiated widespread discussion on how India should strive to be among the top nations in the world in the AI ecosystem. The two-year M.Tech. in Artificial Intelligence aims to fill the critical needs of the industry and to fill the gap in the availability of high-end AI scientists and engineers.�The vision of the M.Tech. (AI) program is to impart rigorous training in the foundations and deep technology of Artificial Intelligence to produce graduates who can become world leaders in AI and lead India's march towards leadership in this important area. Curriculum tailored to bring students to the forefront of 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import cross_val_predict

In [None]:
# Sparse (L1) one-vs-rest logistic regression used to explain the clusters.
# NOTE(review): `multi_class` is reported as deprecated by recent scikit-learn
# releases - confirm against the installed version before upgrading.
lr = LogisticRegression(
    penalty="l1",
    solver="saga",
    C=10.0,
    multi_class="ovr",
    random_state=2019,
    n_jobs=2,
)

# Out-of-fold class probabilities from 10-fold cross-validation.
proba = cross_val_predict(
    estimator=lr,
    X=X,
    y=df["cluster"],
    cv=10,
    n_jobs=8,
    method="predict_proba",
)



In [None]:
# The columns of `proba` follow the class labels in sorted order, so the argmax
# column index is translated back to the actual cluster label. This is identical
# to the raw argmax when the labels are exactly 0..k-1, and stays correct
# when they are not.
class_labels = df.cluster.sort_values().unique()
preds = class_labels[proba.argmax(axis=1)]

In [None]:
# Share of programmes whose cross-validated prediction matches their cluster.
accuracy_score(y_true=df["cluster"], y_pred=preds)

0.6306306306306306

In [None]:
# Refit on all rows so the coefficients can be inspected, then show their shape.
lr.fit(X, df["cluster"])
lr.coef_.shape



(5, 2502)

In [None]:
def get_important_words(cluster, lr, vectorizer, n=10):
    """Print the `n` vocabulary terms with the largest coefficients for `cluster`.

    Parameters
    ----------
    cluster : int
        Cluster (class) label whose coefficient row is inspected.
    lr : fitted linear classifier
        Must expose `coef_` (one row per class). When it also exposes
        `classes_` with one entry per row of `coef_`, `cluster` is looked up
        in `classes_` to find the row; otherwise `cluster` is used directly
        as the row index.
    vectorizer : fitted vectorizer
        Must expose `vocabulary_`, a mapping from term to column index.
    n : int, default 10
        Number of terms to print. Values <= 0 print an empty list.

    Returns
    -------
    None
        The result is printed as a list of `(term, coefficient)` tuples,
        ordered from the largest coefficient downwards.

    Raises
    ------
    ValueError
        If `lr.classes_` is used for the lookup and does not contain `cluster`.
    """
    if n <= 0:
        # coef.argsort()[-0:] would select *every* feature, not none.
        print([])
        return

    classes = getattr(lr, "classes_", None)
    if classes is not None and len(classes) == len(lr.coef_):
        # Row i of coef_ belongs to classes_[i]; do not assume labels are 0..k-1.
        row = list(classes).index(cluster)
    else:
        row = cluster
    coef = lr.coef_[row]

    # Invert term -> column into column -> term for the lookup below.
    inv_vocab = {column: term for term, column in vectorizer.vocabulary_.items()}

    # argsort is ascending: take the last n columns and reverse them.
    top_n = coef.argsort()[-n:][::-1]
    print([(inv_vocab[k], coef[k]) for k in top_n])

In [None]:
# Print the ten highest-weighted terms for every cluster, in label order,
# with a rule of '#' characters after each one.
separator = "#" * 126
for cluster_id in sorted(df["cluster"].unique()):
    print(f"Cluster: {cluster_id}")
    get_important_words(cluster=cluster_id, lr=lr, vectorizer=vectorizer, n=10)
    print(separator)

Cluster: 0
[('health', 9.761323449019645), ('methods', 6.0339646182494375), ('communication', 5.748201520978229), ('healthcare', 5.61735384406795), ('increasing', 5.531542209715788), ('deluge', 5.164560165420196), ('spatial', 3.6458737386694775), ('biomedical', 3.496601574249896), ('information', 3.3271256973621894), ('qualified', 3.2448514690507664)]
##############################################################################################################################
Cluster: 1
[('business', 15.593743799749983), ('analytics', 13.651869229110009), ('talent', 5.13511016066039), ('marketing', 4.314342902556408), ('big', 3.891360792107432), ('urban', 2.898426289049795), ('practical', 2.764544257073721), ('carry', 2.202545056747543), ('centric', 1.7651232418942497), ('storage', 1.5753822851356105)]
##############################################################################################################################
Cluster: 2
[('intelligence', 11.637919038627125), ('ai', 10

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=596b9f0a-2431-4aa3-878c-95287ebfbe9a' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>