# One Hot Encoding

In [1]:

import pandas as pd
import numpy as np
# Sample categorical data
data = np.array([['Red'], ['Green'], ['Blue'], ['Red'], ['Blue']])
df=pd.DataFrame(data,columns=['Color'])
df

Unnamed: 0,Color
0,Red
1,Green
2,Blue
3,Red
4,Blue


In [2]:
from sklearn.preprocessing import OneHotEncoder
# Initialize the OneHotEncoder
encoder = OneHotEncoder(sparse_output=False)
# Fit and transform the data
encoded_data = encoder.fit_transform(df[['Color']])

In [3]:
encoded_data

array([[0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [4]:
# Get the names of the new columns from the encoder
new_column_names = encoder.categories_[0]

# Create a new DataFrame from the encoded data with appropriate column names
encoded_df = pd.DataFrame(encoded_data, columns=new_column_names, index=df.index)

# Concatenate the original DataFrame (excluding the original 'Color' column) and the new encoded DataFrame
df1 = pd.concat([df.drop('Color', axis=1), encoded_df], axis=1)

# Display the head of the updated DataFrame
df1.head()

Unnamed: 0,Blue,Green,Red
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0


# Bag Of Words (BOW)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Create the BoW transformer
vectorizer = CountVectorizer()

In [7]:
vectorizer.fit_transform(df['Color']).toarray()

array([[0, 0, 1],
       [0, 1, 0],
       [1, 0, 0],
       [0, 0, 1],
       [1, 0, 0]])

In [8]:
df3=pd.DataFrame(vectorizer.fit_transform(df['Color']).toarray(),columns=vectorizer.get_feature_names_out())

In [9]:
df3.head()

Unnamed: 0,blue,green,red
0,0,0,1
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0


In [10]:
df = pd.DataFrame({"text":["people watch dswithbappy",
                         "dswithbappy watch dswithbappy",
                         "people write comment",
                          "dswithbappy write comment"],"output":[1,1,0,0]})

df


Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [11]:
bow = vectorizer.fit_transform(df['text'])

In [12]:
bow.toarray()

array([[0, 1, 1, 1, 0],
       [0, 2, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [1, 1, 0, 0, 1]])

In [13]:
df3=pd.DataFrame(vectorizer.fit_transform(df['text']).toarray(),columns=vectorizer.get_feature_names_out())

In [14]:
df3.head()

Unnamed: 0,comment,dswithbappy,people,watch,write
0,0,1,1,1,0
1,0,2,0,1,0
2,1,0,1,0,1
3,1,1,0,0,1


In [15]:
print(vectorizer.vocabulary_)

{'people': 2, 'watch': 3, 'dswithbappy': 1, 'write': 4, 'comment': 0}


#N-Grams

In [16]:
df = pd.DataFrame({"text":["people watch dswithrazib",
                         "dswithrazib watch dswithrazib",
                         "people write comment",
                          "dswithrazib write comment"],"output":[1,1,0,0]})

df


Unnamed: 0,text,output
0,people watch dswithrazib,1
1,dswithrazib watch dswithrazib,1
2,people write comment,0
3,dswithrazib write comment,0


# Bi Gram

In [17]:
# BI grams
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(2,2))


In [18]:
biGram=vectorizer.fit_transform(df['text'])

In [19]:
biGram.toarray()

array([[0, 0, 1, 0, 1, 0],
       [1, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 0, 0, 0, 1]])

In [20]:
#vocabulary
print(vectorizer.vocabulary_)

{'people watch': 2, 'watch dswithrazib': 4, 'dswithrazib watch': 0, 'people write': 3, 'write comment': 5, 'dswithrazib write': 1}


In [21]:
df3=pd.DataFrame(vectorizer.fit_transform(df['text']).toarray(),columns=vectorizer.get_feature_names_out())

In [22]:
df3.head()

Unnamed: 0,dswithrazib watch,dswithrazib write,people watch,people write,watch dswithrazib,write comment
0,0,0,1,0,1,0
1,1,0,0,0,1,0
2,0,0,0,1,0,1
3,0,1,0,0,0,1


# Ti Gram

In [23]:
vectorizer = CountVectorizer(ngram_range=(3,3))
tiGram=vectorizer.fit_transform(df['text']).toarray()
print(vectorizer.vocabulary_)

{'people watch dswithrazib': 2, 'dswithrazib watch dswithrazib': 0, 'people write comment': 3, 'dswithrazib write comment': 1}


In [24]:
df3=pd.DataFrame(vectorizer.fit_transform(df['text']).toarray(),columns=vectorizer.get_feature_names_out())
df3.head()

Unnamed: 0,dswithrazib watch dswithrazib,dswithrazib write comment,people watch dswithrazib,people write comment
0,0,0,1,0
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0


#TF-IDF (Term frequency- Inverse document frequency)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfid= TfidfVectorizer()

In [26]:
df = pd.DataFrame({"text":["people watch dswithrazib",
                         "dswithrazib watch dswithrazib",
                         "people write comment",
                          "dswithrazib write comment"],"output":[1,1,0,0]})

df

Unnamed: 0,text,output
0,people watch dswithrazib,1
1,dswithrazib watch dswithrazib,1
2,people write comment,0
3,dswithrazib write comment,0


In [27]:
tf=tfid.fit_transform(df['text']).toarray()
tf

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

In [28]:
# Create a DataFrame from the TF-IDF data
# Use the get_feature_names_out() method from the tfid object
df3=pd.DataFrame(tfid.fit_transform(df['text']).toarray(),columns=tfid.get_feature_names_out())
df3.head()

Unnamed: 0,comment,dswithrazib,people,watch,write
0,0.0,0.496816,0.613667,0.613667,0.0
1,0.0,0.850816,0.0,0.525464,0.0
2,0.57735,0.0,0.57735,0.0,0.57735
3,0.613667,0.496816,0.0,0.0,0.613667


# Word2vec

In [29]:
!pip install gensim





In [30]:
import numpy as np
import pandas as pd
 # This import should now work after successful installation
import os

In [32]:
from gensim.models import Word2Vec


from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [38]:
story = []
for filename in os.listdir('data'):
    # Check if the item is a directory before attempting to open it
    if filename == '.ipynb_checkpoints':
        pass
    else:
        # Construct the full path to the file
        filepath = os.path.join('data', filename)
        # Open and process the file only if it's not the checkpoint directory
        f = open(filepath)
        corpus = f.read()
        raw_sent = sent_tokenize(corpus)
        for sent in raw_sent:
            story.append(simple_preprocess(sent))

In [39]:
story

[['game',
  'of',
  'thrones',
  'book',
  'one',
  'of',
  'song',
  'of',
  'ice',
  'and',
  'fire',
  'by',
  'george',
  'martin',
  'prologue',
  'we',
  'should',
  'start',
  'back',
  'gared',
  'urged',
  'as',
  'the',
  'woods',
  'began',
  'to',
  'grow',
  'dark',
  'around',
  'them'],
 ['the', 'wildlings', 'are', 'dead'],
 ['do', 'the', 'dead', 'frighten', 'you'],
 ['ser',
  'waymar',
  'royce',
  'asked',
  'with',
  'just',
  'the',
  'hint',
  'of',
  'smile'],
 ['gared', 'did', 'not', 'rise', 'to', 'the', 'bait'],
 ['he',
  'was',
  'an',
  'old',
  'man',
  'past',
  'fifty',
  'and',
  'he',
  'had',
  'seen',
  'the',
  'lordlings',
  'come',
  'and',
  'go'],
 ['dead', 'is', 'dead', 'he', 'said'],
 ['we', 'have', 'no', 'business', 'with', 'the', 'dead'],
 ['are', 'they', 'dead'],
 ['royce', 'asked', 'softly'],
 ['what', 'proof', 'have', 'we'],
 ['will', 'saw', 'them', 'gared', 'said'],
 ['if',
  'he',
  'says',
  'they',
  'are',
  'dead',
  'that',
  'proof',


In [40]:
len(story)

8602

In [59]:
model=Word2Vec(
    window=10,
    min_count=2,

)

In [60]:
model.build_vocab(story)

In [61]:
model.train(story,total_examples=model.corpus_count,epochs=model.epochs)

(322617, 447775)

In [62]:
vec=model.wv.get_normed_vectors()
vec

array([[-0.13145702,  0.06365172,  0.10586422, ..., -0.09690707,
         0.08868233,  0.02110688],
       [-0.12811744,  0.0638429 ,  0.0994866 , ..., -0.094146  ,
         0.08409312,  0.01118333],
       [-0.09497271,  0.04719372,  0.07837666, ..., -0.08718456,
         0.081095  , -0.01605231],
       ...,
       [-0.07618805,  0.1313211 ,  0.11527691, ..., -0.11440054,
         0.08316997,  0.04418523],
       [-0.08739878,  0.08245282,  0.05122644, ..., -0.11939844,
         0.05907656, -0.03051116],
       [-0.00430105,  0.12214058, -0.01432589, ..., -0.06721959,
         0.0897169 , -0.02310034]], dtype=float32)

In [63]:
vec[0]

array([-0.13145702,  0.06365172,  0.10586422,  0.10237828, -0.08683284,
       -0.18509564,  0.11946128,  0.2642084 , -0.16613904, -0.04919754,
       -0.03021148, -0.19147062, -0.07489166,  0.04662741, -0.05611612,
       -0.03275659,  0.03226278, -0.08420098, -0.00891824, -0.23570307,
        0.11990893,  0.09381065,  0.12152522, -0.05135298, -0.06936532,
       -0.0443745 , -0.01774943,  0.00558727, -0.13847637, -0.00422782,
        0.10784048, -0.00295058,  0.08890814, -0.11853271, -0.12714162,
        0.10334813,  0.14423858, -0.10218174, -0.09512444, -0.291336  ,
        0.04505933, -0.10816307, -0.11457403,  0.0162235 ,  0.06538014,
       -0.03154401, -0.16573557,  0.02590968,  0.16061512,  0.01464643,
       -0.02337595, -0.09851264, -0.03406306, -0.03238548, -0.0269802 ,
       -0.02266851,  0.08388206, -0.06724488, -0.12134758,  0.08205374,
        0.00286924,  0.02083295,  0.0349045 , -0.09557126, -0.14095671,
        0.13850582,  0.06096084,  0.11437329, -0.07534221,  0.10

In [64]:
len(vec[0])

100

In [65]:
model.wv.most_similar('daenerys')

[('still', 0.9993079900741577),
 ('fire', 0.9992390275001526),
 ('while', 0.999233067035675),
 ('then', 0.9992324709892273),
 ('between', 0.9992311000823975),
 ('an', 0.9992216229438782),
 ('three', 0.9992095828056335),
 ('dothraki', 0.9992045164108276),
 ('all', 0.9991854429244995),
 ('off', 0.9991826415061951)]

In [66]:
model.wv.similarity('arya','sansa')

0.9997457

In [67]:
y = model.wv.index_to_key

In [68]:
#dimentioin reduces
from sklearn.decomposition import PCA
pca=PCA(n_components=3)
X = pca.fit_transform(model.wv.get_normed_vectors())

In [69]:
X

array([[-2.0580053e-02, -1.8919581e-01, -3.5152095e-04],
       [-3.0902445e-02, -1.5098859e-01,  3.8051764e-03],
       [-5.4147482e-02,  3.0626601e-02,  1.5673158e-04],
       ...,
       [ 7.6437056e-02,  3.2856338e-02,  2.7861740e-03],
       [-9.8769069e-03, -7.0941001e-02, -1.5182365e-02],
       [ 7.7795506e-02,  1.5179056e-01,  7.3908113e-02]], dtype=float32)

In [70]:
len(X[0])

3

In [71]:
import plotly.express as px
fig = px.scatter_3d(X[200:300],x=0,y=1,z=2, color=y[200:300])
fig.show()