# K-MEANS-CLUSTERING

**IMPORTING LIBRARIES**

In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#######
import collections #For fetching dictionary of labels & clusters
import nltk #Natural Language Toolkit
nltk.download('stopwords')  #STOPWORDS (I ,AM, A, AN, YOU, WE)
nltk.download('punkt')   #PUNCTUATION
from nltk import word_tokenize #Word tokenization is the process of splitting a large sample of text into words.
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer #Normalizing Sentences
########
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
#######
from pprint import pprint


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


**LOADING DATASET**

In [20]:
sentences = pd.read_csv("/content/drive/My Drive/data/Quotes.csv")
sentences

Unnamed: 0,Quotes
0,Graphics designers are most creative people
1,Artificial Intelligence or AI is the last inve...
2,Snooker is a billiards sport for normally two ...
3,Snooker is played on a large (12 feet by 6 fee...
4,FOREX is the stock market for trading currencies
5,Software Engineering is hotter and hotter topi...
6,Love is blind
7,Snooker is popular in the United Kingdom and m...
8,The flying or operating of aircraft is known a...
9,AI is likely to be either the best or worst th...


**CONVERTING DATA INTO LIST**

In [21]:
sentences_list = sentences["Quotes"].tolist()


**DISPLAYING DATA AS LIST**

In [22]:
sentences_list

['Graphics designers are most creative people',
 'Artificial Intelligence or AI is the last invention - humans could ever make',
 'Snooker is a billiards sport for normally two players.',
 'Snooker is played on a large (12 feet by 6 feet) table that is covered with a smooth green material.',
 'FOREX is the stock market for trading currencies',
 'Software Engineering is hotter and hotter topic in Silicon Valley',
 'Love is blind',
 'Snooker is popular in the United Kingdom and many other countries',
 'The flying or operating of aircraft is known as aviation.',
 'AI is likely to be either the best or worst thing happen to humanity',
 'Design is Intelligence made visible.',
 'Falling in love is like being on drugs.',
 'There is only one happiness in Life to Love and to be loved.',
 "Boeing 777 is considered world's largest economical plane in the world of Aviation.",
 'Warren Buffet is famous for making good investments.He knows stock markets',
 'The biggest of the many uses of aviation a

# 1) sentences into list
# 2) tokenization i.e. spliting sentences into words
# 3) removing stop words(i,a,am,are ,you,me etc)
## 4) tfidf vectorization i.e. assigning words numbers.similar words are given same numbers 



**CREATING A FUNCTION FOR TOKENIZATION AND REMOVING STOP WORDS(STEMMER)**

In [23]:
#function to tokenize sentences and then removing stop words and then normailizing the remaining tokens 


def tokenizer(text):
  tokens = word_tokenize(text) #Word tokenization is the process of splitting a large sample of text into words.
  stemmer = PorterStemmer()

  #Removing Morphhological axes (stop words for normalizing)
  tokens = [stemmer.stem(t) for t in tokens if t not in stopwords.words('english')]
  return tokens


**CREATING A FUNCTION FOR TFIDF VECTORIZATION**

In [24]:
# TF-IDF enables us to gives us a way to associate each word in a document with a number that represents 
# how relevant each word is in that document. Then, documents with similar, relevant words will have 
# similar vectors, which is what we are looking for in a machine learning algorithm.

def cluster_sentences(sentences_list, k):

  tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenizer, stop_words=stopwords.words('english'),lowercase=True)
  tfidf_matrix = tfidf_vectorizer.fit_transform(sentences_list)

  kmeans = KMeans(n_clusters=k)
  kmeans.fit(tfidf_matrix)

  clusters = collections.defaultdict(list)

  for i, label in enumerate(kmeans.labels_):
    clusters[label].append(i)

  return dict(clusters)


** TESTING THE MODEL** 

In [25]:
k = 7
clusters = cluster_sentences(sentences_list,k)  #calling function
for cluster in range (k):             # CLUSTER 0:
  print("\nCLUSTER ",cluster,":\n")   # CLUSTER 0:
  for i, sentence in enumerate(clusters[cluster]):  #1: love is blind 2:falling in love us kike being on drugs
    print("\t",(i+1),": ",sentences_list[sentence]) #1: love is blind 2:falling in love us kike being on drugs




CLUSTER  0 :

	 1 :  Love is blind
	 2 :  Falling in love is like being on drugs.
	 3 :  There is only one happiness in Life to Love and to be loved.
	 4 :  Being in love is the number one reason why people wed.
	 5 :  Loving from a long distance actually strengthens a relationship
	 6 :  Real love is able to awaken your soul.

CLUSTER  1 :

	 1 :  Artificial Intelligence or AI is the last invention - humans could ever make
	 2 :  Google will fulfill its mission only when its search engine is AI - complete You guys know what that means? That's Artificial Intelligence.
	 3 :  Auomation is the biggest blessing given by Artificial Inteligence.

CLUSTER  2 :

	 1 :  The flying or operating of aircraft is known as aviation.
	 2 :  AI is likely to be either the best or worst thing happen to humanity
	 3 :  Boeing 777 is considered world's largest economical plane in the world of Aviation.
	 4 :  The biggest of the many uses of aviation are in air travel and military aircraft.
	 5 :  Aviatio

  'stop_words.' % sorted(inconsistent))
