### loading the dataset 

In [1]:
import sklearn.datasets as skd

categories = ['android', 'blockchain','cloud', 'ML','webdev']
news_train = skd.load_files('C:/Users/Sanketh/Desktop/newproject/ML part/dataset/train', categories= categories, encoding= 'ISO-8859-1')

In [2]:
news_train

{'data': ['Backend development (also stylized as back-end or back end development) is the skill that powers the web. Yet it does it modestly, without fanfareâ\x80\x94allowing people to browse their favorite sites without even knowing about all the work put in by the backend developer or team.\r\nFront-end Vs Backend Programming\r\n\r\nIntuitively, the other side of backend development is the front end. \r\n\r\nThe key difference is that while backend developers build how a website functions, front-end programmers build and design the interface, determining how the site looks to users. \r\n\r\nBackend web development lays the foundational code that enables websites to process the actions that users take on the front end and deliver the correct information in return.\r\nPHP powers 78.2% of all websites whose server-side programming language we know. The language was first released in 1995 when there were few options for building dynamic websites.\r\nwebsite website \r\nASP.NET is Microso

In [3]:
news_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [4]:
news_train['target_names']

['ML', 'android', 'blockchain', 'cloud', 'webdev']

In [5]:
news_train['target']

array([4, 0, 4, 2, 2, 4, 1, 3, 1, 0, 1, 3, 2, 1, 4, 2, 0, 0, 3, 3])

### vectorization of sentences 

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
text=['the quick dog jumped over the lazy dog',
     'the dog',
     'the quick']
vector=CountVectorizer()
vector.fit(text)
print('print vocabulary'+str(vector.vocabulary_)+'\n\n')
print('feature names'+str(vector.get_feature_names())+'\n\n')
count=vector.transform(text)
print('printing count'+str(count.toarray()))

print vocabulary{'the': 5, 'quick': 4, 'dog': 0, 'jumped': 1, 'over': 3, 'lazy': 2}


feature names['dog', 'jumped', 'lazy', 'over', 'quick', 'the']


printing count[[2 1 1 1 1 2]
 [1 0 0 0 0 1]
 [0 0 0 0 1 1]]


In [8]:
count_vec=CountVectorizer(stop_words='english')
X_train_tf=count_vec.fit_transform(news_train.data)
X_train_tf.shape

(20, 1445)

### Term-frequency inverse document frequency transformation

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
transf=TfidfTransformer()
transf.fit(count)
print('Learing frequency'+str(transf.idf_)+'\n\n')
freq=transf.transform(count)
print('tranform matrix on base of learnt frequency'+str(freq.toarray()))

Learing frequency[1.28768207 1.69314718 1.69314718 1.69314718 1.28768207 1.        ]


tranform matrix on base of learnt frequency[[0.56345652 0.3704388  0.3704388  0.3704388  0.28172826 0.43757425]
 [0.78980693 0.         0.         0.         0.         0.61335554]
 [0.         0.         0.         0.         0.78980693 0.61335554]]


In [10]:
text='new text'
t=vector.transform([text])
transf.transform(t).shape

(1, 6)

In [11]:
transformer=TfidfTransformer()
X_train_tfidf=transformer.fit_transform(X_train_tf)
X_train_tfidf.shape

(20, 1445)

### Multinomial naive bayes classification

In [12]:
from sklearn.naive_bayes import MultinomialNB
clf=MultinomialNB().fit(X_train_tfidf,news_train.target)

In [13]:
news_test=skd.load_files('C:/Users/Sanketh/Desktop/newproject/ML part/dataset/test',categories= categories, encoding= 'ISO-8859-1')

In [14]:
news_test.data

['using etherium voting can be conducted',
 'classifying different news groups',
 'we are trying to develop a android application for blind people',
 'cloud resource provisiong and adaptation',
 'backend technology such as nodejs will be used in this project']

In [15]:
X_test=count_vec.transform(news_test.data)
X_test_tfidf=transformer.transform(X_test)
pred=clf.predict(X_test_tfidf)

In [16]:
from sklearn.metrics import accuracy_score
print('Accuracy is ',accuracy_score(pred,news_test.target))

Accuracy is  1.0


In [17]:
pred

array([2, 0, 1, 3, 4])

In [18]:
test=['In machine learing prediction method is used']
test=count_vec.transform(test)
test=transformer.transform(test)
test.shape

(1, 1445)

In [24]:
vector

<1x1445 sparse matrix of type '<class 'numpy.int64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [28]:
test=['html css']
test=count_vec.transform(test)
test=transformer.transform(test)
clf.predict(test)

array([4])

In [39]:
test=['cloud applications in nlp']
test=count_vec.transform(test)
test=transformer.transform(test)
clf.predict_proba(test)

array([[0.14305821, 0.14989375, 0.15696553, 0.40232029, 0.14776221]])

In [22]:
import pickle

In [23]:
pickle.dump(count_vec, open("vector.pickel", "wb"))

In [24]:
pickle.dump(transformer,open("tfidf.pickel","wb"))

In [25]:
pickle.dump(clf,open("nlpmodel.pickel","wb"))

In [26]:
pip freeze

appdirs==1.4.3
asn1crypto==0.24.0
attrs==19.1.0
backcall==0.1.0
bleach==3.1.0
bokeh==2.0.1
boto==2.49.0
boto3==1.12.11
botocore==1.15.11
certifi==2019.3.9
cffi==1.12.3
chardet==3.0.4
Click==7.0
cloudpickle==1.2.2
colorama==0.4.1
cryptography==2.6.1
cycler==0.10.0
decorator==4.4.0
defusedxml==0.6.0
distlib==0.3.0
Django==2.2.2
django-crispy-forms==1.7.2
django-filter==2.2.0
djangorestframework==3.11.0
docutils==0.15.2
entrypoints==0.3
filelock==3.0.12
Flask==1.1.1
future==0.18.2
gensim==3.8.1
gym==0.15.4
idna==2.8
importlib-metadata==1.5.0
inflection==0.3.1
ipykernel==5.1.1
ipython==7.5.0
ipython-genutils==0.2.0
ipywidgets==7.4.2
itsdangerous==1.1.0
jedi==0.13.3
Jinja2==2.10.1
jmespath==0.9.5
joblib==0.13.2
jsonschema==3.0.1
jupyter==1.0.0
jupyter-client==5.2.4
jupyter-console==6.0.0
jupyter-core==4.4.0
kiwisolver==1.1.0
Markdown==3.2.1
MarkupSafe==1.1.1
matplotlib==3.0.3
mistune==0.8.4
more-itertools==5.0.0
nbconvert==5.5.0
nbformat==4.4.0
ndg-httpsclient==0.5.1
notebook==5.7.8
numpy==

In [57]:
c = data.tocoo()                                                                                        
df = pd.DataFrame({'node1': c.row, 'node2': c.col, 'edge_weight': c.data}) 

In [58]:
import pandas as pd

In [70]:
t1=count_vec.transform([news_test.data[0]])
print(t1)

  (0, 268)	1
  (0, 475)	1
  (0, 1364)	1
  (0, 1398)	1


In [71]:
t2=transformer.transform(t1)
print(t2)

  (0, 1398)	0.536007290924944
  (0, 1364)	0.2314278815635678
  (0, 475)	0.536007290924944
  (0, 268)	0.6097815213546603


In [72]:
t3=count_vec.transform([news_test.data[1]])
print(t3)

  (0, 219)	1
  (0, 391)	1
  (0, 597)	1


In [78]:
t4=transformer.transform(t3)
print(t4.shape)

(1, 1445)


In [77]:
ne=news_test.data[0]+news_test.data[1]
print(transformer.transform(count_vec.transform([ne])).shape)

(1, 1445)


In [80]:
import scipy
tra=scipy.sparse.hstack([t2, t4])

In [92]:
m=(t2.toarray()+t4.toarray())/2

In [93]:
clf.predict(m)

array([0])

In [225]:
text1=['blockchain']
text3=['ml machine learning']

In [214]:
t1=transformer.transform(count_vec.transform(text3))

In [226]:
t=transformer.transform(count_vec.transform(text1))

In [227]:
print(t1)

  (0, 839)	0.6653096504335756
  (0, 785)	0.6003390133894425
  (0, 740)	0.4437974065297633


In [228]:
print(t)

  (0, 152)	1.0


In [293]:
k=(t.toarray()+t1.toarray())/2

In [294]:
print(k[0][839])

0.3326548252167878


In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
k=transf.transform(vector.transform(['this is ml project']))
k=k.toarray()
cosine_similarity(k.T,X_train_tfidf).sort()

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 1 while Y.shape[1] == 1445

In [279]:
j=transformer.transform(count_vec.transform(['this is is related html css']))

In [298]:
similarity_scores = X_train_tfidf.dot(j.toarray().T)    # get sorted similarity score indices  
sorted_indicies = np.argsort(similarity_scores, axis = 0)[::-1]

In [299]:
import numpy as np

In [300]:
sorted_indicies

array([[14],
       [ 2],
       [ 9],
       [ 6],
       [19],
       [ 8],
       [15],
       [13],
       [12],
       [11],
       [10],
       [18],
       [ 7],
       [16],
       [ 5],
       [ 4],
       [ 3],
       [17],
       [ 1],
       [ 0]], dtype=int64)

In [301]:
news_train.data[2]

"\r\nBasic setup and learning how to learn\r\nWeb standards and best practices (such as accessibility and cross-browser compatibility)\r\nHTML, the language that gives web content structure and meaning\r\nCSS, the language used to style web pages\r\nJavaScript, the scripting language used to create dynamic functionality on the web\r\nTooling that is used to facilitate modern client-side web development.\r\n\r\nYou can work through sections in order, but each one is also self-contained. For example, if you already know HTML, you can skip ahead to the CSS section.\r\nBy the end of the course, you'll have all the tools and practical knowledge necessary to build visually rich Single page Web applications, try your luck with job interviews and kickstart your career as a Front End Developer!\r\nAfter completion you'll have touched upon the comprehensive curriculum of a Junior Front End Developer. Try your luck, applying for jobs, getting feedbacks and improving on the solid foundations built

In [258]:
similarity_scores

array([[0.        ],
       [0.06444192],
       [0.01079077],
       [0.26588151],
       [0.10953974],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.00667336],
       [0.26763507],
       [0.        ],
       [0.        ],
       [0.23369575],
       [0.        ],
       [0.        ],
       [0.14934626],
       [0.12557611],
       [0.14000697],
       [0.        ],
       [0.        ]])

In [266]:
cosine_similarity(X_train_tfidf,j)

array([[0.        ],
       [0.        ],
       [0.16315791],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.03478984],
       [0.        ],
       [0.0226425 ],
       [0.04032682],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.24236576],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.02694342]])

In [303]:
sorted_indicies = np.argsort(cosine_similarity(X_train_tfidf,j), axis = 0)[::-1]

In [304]:
sorted_indicies

array([[14],
       [ 2],
       [ 9],
       [ 6],
       [19],
       [ 8],
       [15],
       [13],
       [12],
       [11],
       [10],
       [18],
       [ 7],
       [16],
       [ 5],
       [ 4],
       [ 3],
       [17],
       [ 1],
       [ 0]], dtype=int64)

In [302]:
cosine_similarity(X_train_tfidf,j.toarray())

array([[0.        ],
       [0.        ],
       [0.16315791],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.03478984],
       [0.        ],
       [0.0226425 ],
       [0.04032682],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.24236576],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.02694342]])

In [305]:
l=j.toarray()

In [310]:
np.frombuffer(l.tostring(),dtype='int')

array([0, 0, 0, ..., 0, 0, 0])

In [23]:
news_train

{'data': ['Backend development (also stylized as back-end or back end development) is the skill that powers the web. Yet it does it modestly, without fanfareâ\x80\x94allowing people to browse their favorite sites without even knowing about all the work put in by the backend developer or team.\r\nFront-end Vs Backend Programming\r\n\r\nIntuitively, the other side of backend development is the front end. \r\n\r\nThe key difference is that while backend developers build how a website functions, front-end programmers build and design the interface, determining how the site looks to users. \r\n\r\nBackend web development lays the foundational code that enables websites to process the actions that users take on the front end and deliver the correct information in return.\r\nPHP powers 78.2% of all websites whose server-side programming language we know. The language was first released in 1995 when there were few options for building dynamic websites.\r\nwebsite website \r\nASP.NET is Microso