Original code by ***__Raghavender Ganesh__***.   
Updated with better documentation and code readability.

## ***__3B. Information Extraction from Unstructured Data__***

### ***__Libraries__***

In [None]:
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
import warnings
# Silence all warnings for the remainder of the session so cell output stays
# clean. NOTE(review): this is a blanket filter, so it also hides deprecation
# and future-behaviour notices emitted by any later cell.
warnings.filterwarnings('ignore')

### ***__Sample Unstructured Data__***

In [None]:
# Toy corpus: five short clinical notes used as the unstructured input
# for the rest of the notebook.
texts = [
    "Patient has a history of hypertension and diabetes . Prescribed medication X .",
    "Asthma diagnosis confirmed . Patient advised to use inhaler daily .",
    "Hypertension patient . Needs regular monitoring of blood pressure .",
    "Diabetes patient . Recommended diet and exercise .",
    "Patient diagnosed with hypertension . Medication Y prescribed .",
]

### ***__Loading the Dataset into a pandas DataFrame__***

In [None]:
# One row per note, held in a single text column.
df = pd.DataFrame(texts, columns=["Medical_Texts"])
df.head()

### ***__Preprocessing__***

In [None]:
def preprocess(text):
    """Normalize a raw text string into a space-separated string of tokens.

    The text is lower-cased, runs of digits and runs of non-word characters
    are each replaced by a single space, and the result is split on
    whitespace. Tokens that are English stop words, or that are a bare
    punctuation character, are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing survives the filtering).
    """
    # Fetch the stop-word list once per call and keep it in a set: the list
    # is no longer re-fetched and scanned linearly for every single token.
    stop_words = set(stopwords.words("english"))

    text = text.lower()
    text = re.sub(r"\d+", " ", text)
    text = re.sub(r"\W+", " ", text)

    # `\W` does not match "_", so a lone underscore can survive the
    # substitution above; the punctuation check removes it.
    tokens = [
        word
        for word in text.split()
        if word not in stop_words and word not in string.punctuation
    ]
    return " ".join(tokens)

# Overwrite the raw notes with their cleaned, stop-word-free form.
df["Medical_Texts"] = df["Medical_Texts"].map(preprocess)
df.head()

### ***__TF-IDF Vectorization__***

In [None]:
# Encode each cleaned note as a dense TF-IDF row; the vocabulary is capped
# at 10 terms via `max_features`.
vectorizer = TfidfVectorizer(max_features=10)
x_tfidf = (
    vectorizer
    .fit_transform(df["Medical_Texts"])
    .toarray()
)

x_tfidf[:5]

### ***__K-Means Clustering__***

In [None]:
# Group the TF-IDF vectors into two clusters.
# `random_state` pins the centroid initialisation so the cluster labels are
# reproducible between runs, and `n_init` is set explicitly so the number of
# restarts does not depend on the installed scikit-learn version's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
predictions = kmeans.fit_predict(x_tfidf)

predictions[:5]

### ***__Generate and visualize WordCloud__***

In [None]:
# Merge every cleaned note into one string and build a word cloud from it.
all_texts = " ".join(df["Medical_Texts"])
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color="black",
).generate(all_texts)

# Draw the cloud as an image without axis ticks or frame.
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
# Render explicitly: the figure also appears when the code runs outside a
# notebook, and the cell no longer echoes the return value of `plt.axis`.
plt.show()