In [1]:
import re
import os
import pandas as pd
import numpy as np
import nltk
import contractions
import pickle
import torch
from nltk.corpus import stopwords, wordnet
from spellchecker import SpellChecker
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel

data_path = 'Data Objects/'

In [2]:
"""
import sys
import numpy
numpy.set_printoptions(threshold=sys.maxsize)
"""

'\nimport sys\nimport numpy\nnumpy.set_printoptions(threshold=sys.maxsize)\n'

# 2. TEXT CLEANING AND VECTORIZATION
# 2.1 Text Cleaning

## 2.1.1 Importing Data

In [2]:
df = pd.read_csv(os.path.join(data_path,'speaker_only.csv'))

In [4]:
df.dtypes

Question     object
Text         object
Condition    object
dtype: object

In [5]:
#df['Question'][0]

In [6]:
#df['Text'][1]

In [3]:
df

Unnamed: 0,Question,Text,Condition
0,do you feel like you do creative things with l...,how do you mean language \n well the only pro...,clinical
1,do you feel like you use language creatively?,I do erm I I write song lyrics erm poetry...,clinical
2,do you feel like you use language creatively?,can you repeat that \n yes I do to some exten...,clinical
3,do you feel like you do creative things with ...,right erm er I'm er do I do creative things...,clinical
4,do you feel like you do creative things with l...,OK \n yeah \n erm I suppose it depends what...,clinical
5,do you feel like you use language creatively?,OK OK \n you know I when I came to England a...,clinical
6,do you feel like you use language creatively?,do I do creative things with language y er er...,clinical
7,do you feel like you do creative things with l...,OK yeah \n yeah w so what university are you ...,clinical
8,do you feel like you do creative things with l...,er sometimes I would think more colourfully t...,clinical
9,do you feel like you use language creatively?,do I feel like I use language creatively mm d...,clinical


In [7]:
sentence_data = []

for index, row in df.iterrows():
    text = re.sub(r'\n', '. ',row['Text'])
    sentences = sent_tokenize(text)
    condition = row['Condition']
    
    for sentence in sentences:
        sentence_data.append({'Text': sentence, 'Condition': condition})

split_df = pd.DataFrame(sentence_data, columns=['Text', 'Condition'])

In [8]:
split_df

Unnamed: 0,Text,Condition
0,how do you mean language .,clinical
1,well the only problem I got is learning diffic...,clinical
2,you know and er it e effects my my feelings y ...,clinical
3,er I don't wanna have a relapse as the doctor ...,clinical
4,and like erm when you get older like my age an...,clinical
...,...,...
1772,yep .,comparison
1773,yeah .,comparison
1774,but no this was really fun .,comparison
1775,this was really fun .,comparison


In [9]:
split_df_clean = split_df.copy()
df_clean = df.copy()

## 2.1.2 Text Cleaning

In [10]:
stop_words = set(stopwords.words('english'))

def clean_text(df,feature):
    df[feature] = df[feature].apply(lambda x: x.lower()) #Lowercase letters
    df[feature] = df[feature].apply(lambda x: re.sub(r'\d+', '', x)) # Digits
    df[feature] = df[feature].apply(lambda x: contractions.fix(x)) # Expand Contractions
    df[feature] = df[feature].apply(lambda x: re.sub(r'[^\w]+', ' ', re.sub(r'\s+', ' ', x)).strip())
    df[feature] = df[feature].apply(lambda text: ' '.join([word for word in text.split() if word not in stop_words]))

In [17]:
clean_text(split_df_clean,'Text')
clean_text(df_clean,'Text')

## 2.1.3 TOKENIZATION

In [18]:
split_df_clean['Tokens'] = split_df_clean['Text'].apply(lambda x: word_tokenize(x)) 
df_clean['Tokens'] = df_clean['Text'].apply(lambda x: word_tokenize(x)) 

In [19]:
split_df_clean['Tokens'][1]

['well',
 'problem',
 'got',
 'learning',
 'difficulties',
 'speech',
 'language',
 'sometimes',
 'get',
 'erm',
 'emotional',
 'erm',
 'problems',
 'said',
 'got',
 'learning',
 'difficulties',
 'erm',
 'er',
 'learning',
 'difficulties',
 'like',
 'sometimes',
 'get',
 'panicky']

## 2.1.4 SPELL CHECKER & CORPUS CHECK

In [20]:
spell = SpellChecker()
english_words = set(nltk.corpus.words.words())

In [21]:
# Only spell checker
split_df_clean['Spell_Tokens'] = split_df_clean['Tokens'].apply(lambda x: [spell.correction(token) for token in x])

df_clean['Spell_Tokens'] = df_clean['Tokens'].apply(lambda x: [spell.correction(token) for token in x])

In [22]:
# Spell checker and corpus words check
split_df_clean['Spell_Corpus_Tokens'] = split_df_clean['Tokens'].apply(lambda x: [spell.correction(token) if spell.correction(token) in english_words else token for token in x])

df_clean['Spell_Corpus_Tokens'] = df_clean['Tokens'].apply(lambda x: [spell.correction(token) if spell.correction(token) in english_words else token for token in x])

In [23]:
# Eliminated words after corpus check
# Only see words that are in 'Spell_Tokens' but not in 'Spell_Corpus_Tokens'

# compare_col1 = 'Spell_Corpus_Tokens'
# compare_col2 = 'Spell_Tokens'

def eliminated_words(df,compare_col1,compare_col2):
    eliminated_words = {}

    for n in range(len(df)):
        if df[compare_col1][n] != df[compare_col2][n]:
            words_difference = list(set(df[compare_col2][n]) - set(df[compare_col1][n]))
            eliminated_words[n] = words_difference

    return eliminated_words

In [24]:
split_eliminated = eliminated_words(split_df_clean,'Spell_Corpus_Tokens','Spell_Tokens')
print(split_eliminated,"\n")

eliminated = eliminated_words(df_clean,'Spell_Corpus_Tokens','Spell_Tokens')
print(eliminated)

{40: ['andi'], 89: [None], 91: [None], 96: ['himl', 'rags', None], 97: ['pigs'], 99: ['minutes'], 104: ['pigs', 'himl', 'reacts', None, 'views'], 105: ['himl'], 107: ['overkill', 'username', None], 110: ['himl'], 111: ['divides', 'dies'], 112: ['im', 'himl', 'dies'], 114: ['programming', None], 118: ["asperger's"], 119: [None], 124: ['gimme'], 125: [None], 128: [None], 134: ['factoring', None], 135: [None], 136: [None], 139: ['plains', 'himl', None], 140: [None], 141: [None], 142: [None], 143: ['lloyd', None], 146: ['nixon'], 147: [None], 148: [None], 149: ['nixon', None], 152: ["asperger's"], 155: [None], 159: ['himl', None], 160: ['jak', 'cups', 'meds', None], 163: [None], 165: ['himl', None], 166: ['himl'], 171: [None], 174: ["alzheimer's"], 178: [None], 183: [None, 'views', 'himl', 'reacts'], 220: [None], 221: [None], 231: [None], 232: [None], 233: [None], 234: [None], 239: [None], 278: [None], 297: ['understa'], 318: ['internet'], 319: ['elliot'], 361: ['assassinated'], 363: ['asi

In [25]:
split_words_to_check = ['minutes','dies','divides','programming',"asperger's",'gimme','factoring','lloyd','nixon',
                        'cups','meds',"alzheimer's",'internet','elliot','assassinated','asian','summat','sarah',
                        'carries','founders','freestyle','basing','dictating']

In [26]:
words_to_check = split_words_to_check + ['rags','overkill','pigs','plains','dies','username','views',
                                                    'reacts','mars','mom','sarah','maximise','maximise','shared',
                                                    'distractions','mic','distracting']

In [27]:
# Adding back meaningful eliminated words

def add_words(df, col, eliminated_words, words_to_check):
    #print("Words to check:",words_to_check,"\n")
    eliminated_words_filtered = {}

    for key, value in eliminated_words.items():
        filtered_words = [word for word in value if word in words_to_check and word is not None]
        if filtered_words:
            eliminated_words_filtered[key] = filtered_words
            
    print("Eliminated Words Filtered:\n")
    print(eliminated_words_filtered)
            
    indexes_to_find = eliminated_words_filtered.keys() 

    eliminated_words = [eliminated_words_filtered[index] for index in indexes_to_find]
    
    for idx, values in zip(indexes_to_find, eliminated_words):
        words_to_add = [word for word in values if word not in df[col][idx]]
        df.at[idx, col] += words_to_add

In [28]:
add_words(split_df_clean, 'Spell_Corpus_Tokens', split_eliminated, split_words_to_check)

add_words(df_clean, 'Spell_Corpus_Tokens', eliminated, words_to_check)

Eliminated Words Filtered:

{99: ['minutes'], 111: ['divides', 'dies'], 112: ['dies'], 114: ['programming'], 118: ["asperger's"], 124: ['gimme'], 134: ['factoring'], 143: ['lloyd'], 146: ['nixon'], 149: ['nixon'], 152: ["asperger's"], 160: ['cups', 'meds'], 174: ["alzheimer's"], 318: ['internet'], 319: ['elliot'], 361: ['assassinated'], 363: ['asian'], 779: ['freestyle'], 928: ['summat'], 947: ['sarah'], 948: ['sarah'], 950: ['sarah'], 951: ['summat'], 958: ['summat'], 967: ['summat'], 972: ['summat'], 978: ['summat'], 1029: ['carries'], 1033: ['founders'], 1389: ['dictating'], 1451: ['basing']}
Eliminated Words Filtered:

{1: ['factoring', 'reacts', "alzheimer's", 'dies', 'rags', 'cups', "asperger's", 'lloyd', 'plains', 'views', 'nixon', 'username', 'divides', 'pigs', 'gimme', 'overkill', 'meds'], 3: ['elliot', 'asian', 'internet'], 5: ['mars', 'mom'], 6: ['mom'], 8: ['mom'], 9: ['mom'], 12: ['sarah', 'summat'], 13: ['carries', 'summat', 'founders'], 15: ['mom'], 18: ['mom'], 19: ['mo

## 2.1.5 LEMMATIZATION

In [29]:
def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

split_df_clean['Final_Tokens'] = split_df_clean['Spell_Corpus_Tokens'].apply(lemmatize_tokens)

df_clean['Final_Tokens'] = df_clean['Spell_Corpus_Tokens'].apply(lemmatize_tokens)

In [30]:
split_df_clean['Final_Tokens']

0                                        [mean, language]
1       [well, problem, got, learning, difficulty, spe...
2       [know, er, e, effect, feeling, mean, feeling, ...
3       [er, want, relapse, doctor, say, mean, erm, kn...
4       [like, erm, get, older, like, age, know, diffe...
                              ...                        
1772                                                [yep]
1773                                               [yeah]
1774                                        [really, fun]
1775                                        [really, fun]
1776                                                 [ok]
Name: Final_Tokens, Length: 1777, dtype: object

In [31]:
split_df_clean

Unnamed: 0,Text,Condition,Tokens,Spell_Tokens,Spell_Corpus_Tokens,Final_Tokens
0,mean language,clinical,"[mean, language]","[mean, language]","[mean, language]","[mean, language]"
1,well problem got learning difficulties speech ...,clinical,"[well, problem, got, learning, difficulties, s...","[well, problem, got, learning, difficulties, s...","[well, problem, got, learning, difficulties, s...","[well, problem, got, learning, difficulty, spe..."
2,know er e effects feelings mean feelings feeli...,clinical,"[know, er, e, effects, feelings, mean, feeling...","[know, er, e, effects, feelings, mean, feeling...","[know, er, e, effects, feelings, mean, feeling...","[know, er, e, effect, feeling, mean, feeling, ..."
3,er want relapse doctor says mean erm know know...,clinical,"[er, want, relapse, doctor, says, mean, erm, k...","[er, want, relapse, doctor, says, mean, erm, k...","[er, want, relapse, doctor, says, mean, erm, k...","[er, want, relapse, doctor, say, mean, erm, kn..."
4,like erm get older like age know differences p...,clinical,"[like, erm, get, older, like, age, know, diffe...","[like, erm, get, older, like, age, know, diffe...","[like, erm, get, older, like, age, know, diffe...","[like, erm, get, older, like, age, know, diffe..."
...,...,...,...,...,...,...
1772,yep,comparison,[yep],[yep],[yep],[yep]
1773,yeah,comparison,[yeah],[yeah],[yeah],[yeah]
1774,really fun,comparison,"[really, fun]","[really, fun]","[really, fun]","[really, fun]"
1775,really fun,comparison,"[really, fun]","[really, fun]","[really, fun]","[really, fun]"


In [32]:
df_clean

Unnamed: 0,Question,Text,Condition,Tokens,Spell_Tokens,Spell_Corpus_Tokens,Final_Tokens
0,do you feel like you do creative things with l...,mean language well problem got learning diffic...,clinical,"[mean, language, well, problem, got, learning,...","[mean, language, well, problem, got, learning,...","[mean, language, well, problem, got, learning,...","[mean, language, well, problem, got, learning,..."
1,do you feel like you use language creatively?,erm write song lyrics erm poetry time er mostl...,clinical,"[erm, write, song, lyrics, erm, poetry, time, ...","[erm, write, song, lyrics, erm, poetry, time, ...","[erm, write, song, lyrics, erm, poetry, time, ...","[erm, write, song, lyric, erm, poetry, time, e..."
2,do you feel like you use language creatively?,repeat yes extent speak every day fellow resid...,clinical,"[repeat, yes, extent, speak, every, day, fello...","[repeat, yes, extent, speak, every, day, fello...","[repeat, yes, extent, speak, every, day, fello...","[repeat, yes, extent, speak, every, day, fello..."
3,do you feel like you do creative things with ...,right erm er er creative things language think...,clinical,"[right, erm, er, er, creative, things, languag...","[right, erm, er, er, creative, things, languag...","[right, erm, er, er, creative, things, languag...","[right, erm, er, er, creative, thing, language..."
4,do you feel like you do creative things with l...,ok yeah erm suppose depends mean erm tend crea...,clinical,"[ok, yeah, erm, suppose, depends, mean, erm, t...","[ok, yeah, erm, suppose, depends, mean, erm, t...","[ok, yeah, erm, suppose, depends, mean, erm, t...","[ok, yeah, erm, suppose, depends, mean, erm, t..."
5,do you feel like you use language creatively?,ok ok know came england eleven road went infan...,clinical,"[ok, ok, know, came, england, eleven, road, we...","[ok, ok, know, came, england, eleven, road, we...","[ok, ok, know, came, england, eleven, road, we...","[ok, ok, know, came, england, eleven, road, we..."
6,do you feel like you use language creatively?,creative things language er erm mm mm trick qu...,clinical,"[creative, things, language, er, erm, mm, mm, ...","[creative, things, language, er, erm, my, my, ...","[creative, things, language, er, erm, my, my, ...","[creative, thing, language, er, erm, my, my, t..."
7,do you feel like you do creative things with l...,ok yeah yeah w university mm yeah yeah yeah er...,clinical,"[ok, yeah, yeah, w, university, mm, yeah, yeah...","[ok, yeah, yeah, i, university, my, yeah, yeah...","[ok, yeah, yeah, i, university, my, yeah, yeah...","[ok, yeah, yeah, i, university, my, yeah, yeah..."
8,do you feel like you do creative things with l...,er sometimes would think colourfully creativel...,clinical,"[er, sometimes, would, think, colourfully, cre...","[er, sometimes, would, think, colorfully, crea...","[er, sometimes, would, think, colorfully, crea...","[er, sometimes, would, think, colorfully, crea..."
9,do you feel like you use language creatively?,feel like use language creatively mm matter co...,clinical,"[feel, like, use, language, creatively, mm, ma...","[feel, like, use, language, creatively, my, ma...","[feel, like, use, language, creatively, my, ma...","[feel, like, use, language, creatively, my, ma..."


# 2.2 Text Vectorization 
## 2.2.1 TF-IDF and Count Vectorizer

In [33]:
def text_vectorization(tokens,technique):
    token_strings = [' '.join(t) for t in tokens]
    matrix = technique.fit_transform(token_strings)
    
    sh = matrix.shape
    print(sh[0],"rows/sentences and",sh[1],"columns/unique terms.")
    
    vectorizer_dict = dict(zip(technique.get_feature_names_out(), matrix.toarray().sum(axis=0)))
    vectorizer_dict = dict(sorted(vectorizer_dict.items(), key=lambda x: x[1], reverse=True))

    df = pd.DataFrame(matrix.toarray(), columns=technique.get_feature_names_out())
    
    return matrix,technique,df

TF-IDF represents the importance of words in a document relative to a collection of documents.
It measures the frequency of a term in a document (Term Frequency) weighted by the rarity of the term across documents (Inverse Document Frequency).
Words that appear frequently in a document but are rare in the entire corpus receive higher TF-IDF scores.
Provides a sparse representation where each document is represented by a vector indicating the importance of each term (word) in the document.

Each row corresponds to a specific text (conversation) from the dataset, and each column corresponds to a unique word (feature name) from the entire dataset. 

In [34]:
split_tokens = split_df_clean['Final_Tokens']

stfidf_vectorizer = TfidfVectorizer()
stfidf_matrix,stfidf_vectorizer, stfidf_df = text_vectorization(split_tokens, stfidf_vectorizer)

stfidf_df.to_csv(os.path.join(data_path, 'split_tfidf_matrix.csv'), index=False)

with open(os.path.join(data_path,'split_tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(stfidf_vectorizer, f)

1777 rows/sentences and 3535 columns/unique terms.


In [35]:
tokens = df_clean['Final_Tokens']

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix,tfidf_vectorizer, tfidf_df = text_vectorization(tokens, tfidf_vectorizer)

tfidf_df.to_csv(os.path.join(data_path, 'tfidf_matrix.csv'), index=False)

with open(os.path.join(data_path,'tfidf_vectorizer.pkl'), 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

28 rows/sentences and 3543 columns/unique terms.


Count Vectorizer converts text documents into a matrix where each row represents a document, and each column represents the count of each word in the document.
It counts the occurrences of each word in the document and constructs a term-document matrix.
Provides a sparse representation similar to TF-IDF, where each document is represented by a vector of word counts.

In [36]:
scount_vectorizer = CountVectorizer()
scount_matrix,scount_vectorizer, scount_df = text_vectorization(split_tokens, scount_vectorizer)

scount_df.to_csv(os.path.join(data_path, 'split_count_matrix.csv'), index=False)

with open(os.path.join(data_path,'split_count_vectorizer.pkl'), 'wb') as f:
    pickle.dump(scount_vectorizer, f)

1777 rows/sentences and 3535 columns/unique terms.


In [37]:
count_vectorizer = CountVectorizer()
count_matrix,count_vectorizer, count_df = text_vectorization(tokens, count_vectorizer)

count_df.to_csv(os.path.join(data_path, 'count_matrix.csv'), index=False)

with open(os.path.join(data_path,'count_vectorizer.pkl'), 'wb') as f:
    pickle.dump(count_vectorizer, f)

28 rows/sentences and 3543 columns/unique terms.


## 2.2.2 Word2Vec

Word2Vec is a type of word embedding technique that represents words as dense, low-dimensional vectors in a continuous space.
It captures semantic relationships between words by placing similar words closer together in the vector space.
It's trained on a large corpus of text to learn the word embeddings based on the context of words within sentences.
Provides a dense representation where each word is mapped to a fixed-length vector, capturing semantic and syntactic information.

While there's no direct one-to-one mapping of dimensions to specific linguistic features, the model implicitly assigns roles to these dimensions through the learning process:

* Semantic Relationships: Certain dimensions might specialize in capturing semantic relationships. Words with similar meanings or usage contexts could exhibit similar values along these dimensions. For example, words like "car" and "automobile" might have similar values along certain semantic dimensions.

* Syntactic Relationships: Other dimensions may capture syntactic patterns or grammatical relationships. These dimensions could encode information about verb-object relationships, tense, plurality, or other syntactic structures. For instance, certain dimensions might help distinguish between singular and plural forms of nouns.

* Contextual Information: Some dimensions might encode contextual information, representing the surrounding words or phrases. These dimensions could help the model understand how words are used in different contexts or how their meanings change based on surrounding words.

Word embeddings, such as those created by Word2Vec, can be thought of as points in a high-dimensional coordinate space. Each word is represented by a vector, and these vectors are positioned in a multi-dimensional space where the dimensions correspond to different learned features or aspects of the words.

*Coordinate System:* A multi-dimensional space where each dimension represents a different, abstract feature or property related to words.
*Word Representation:* Each word is represented as a point in this space, with its position determined by the values across all dimensions in its vector representation.
*Semantic Relationships:* Similar words or words used in similar contexts tend to be closer together in this space, while words with different meanings or usages are farther apart.

In [38]:
def text_vectorization_word2vec(tokens, model):
    token_vectors = []
    for sentence in tokens:
        valid_tokens = [token for token in sentence if token in model.wv]
        if valid_tokens:
            vectors = [model.wv[token] for token in valid_tokens]
            sentence_vector = np.mean(vectors, axis=0)
            token_vectors.append(sentence_vector)
        else:
            # Handle cases where all tokens in a sentence are not in the vocabulary
            token_vectors.append(np.zeros(model.vector_size))  # Fill with zeros 
    
    matrix = np.array(token_vectors)
    df = pd.DataFrame(matrix)
    return matrix, model, df

In [39]:
sword2vec_model = Word2Vec(split_tokens, vector_size=200, window=5, min_count=1, workers=4)
sword2vec_matrix, sword2vec_model, sword2vec_df = text_vectorization_word2vec(split_tokens, sword2vec_model)

sword2vec_df.to_csv(os.path.join(data_path, 'split_word2vec_df.csv'), index=False)

In [40]:
word2vec_model = Word2Vec(tokens, vector_size=200, window=5, min_count=1, workers=4)
word2vec_matrix, word2vec_model, word2vec_df = text_vectorization_word2vec(tokens, word2vec_model)

word2vec_df.to_csv(os.path.join(data_path,'word2vec_df.csv'), index=False)

## 2.2.3 Bert

* Masked Language Model (MLM): BERT masks some words in the input and learns to predict those masked words based on the context provided by the rest of the words in the sentence.
* Next Sentence Prediction (NSP): BERT learns to predict whether a pair of sentences follows each other logically in the corpus.

Transormers: A type of neural network architecture. Unlike sequential models like RNNs (Recurrent Neural Networks) that process sequences step by step, Transformers utilize attention mechanisms to process the entire sequence simultaneously in parallel, making them highly efficient for capturing long-range dependencies in data.

In [41]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def generate_bert_embeddings(text):
    #converts it into a format suitable for BERT input (in this case, PyTorch tensors with padding and truncation
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128) 
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()  
    return embeddings

split_df_clean['Bert Embeddings'] = split_df_clean['Text'].apply(lambda x: generate_bert_embeddings(x))
df_clean['Bert Embeddings'] = df_clean['Text'].apply(lambda x: generate_bert_embeddings(x))

In [42]:
sembeddings = np.array(split_df_clean['Bert Embeddings'].tolist())  # Convert to numpy array

# Reshape the embeddings to 2D
sreshaped_embeddings = sembeddings.reshape(sembeddings.shape[0], -1)

with open(os.path.join(data_path,'split_bert_embeddings.pkl'), 'wb') as f:
    pickle.dump(sreshaped_embeddings, f)

In [43]:
embeddings = np.array(df_clean['Bert Embeddings'].tolist())  # Convert to numpy array

# Reshape the embeddings to 2D
reshaped_embeddings = embeddings.reshape(embeddings.shape[0], -1)

with open(os.path.join(data_path,'bert_embeddings.pkl'), 'wb') as f:
    pickle.dump(reshaped_embeddings, f)

In [44]:
split_df_clean.to_csv(os.path.join(data_path,'split_df_clean.csv'), index=False)

In [45]:
df_clean.to_csv(os.path.join(data_path,'df_clean.csv'), index=False)

# 2.3 FINAL NOTES

**Tag çok gerekli değil**

    klasik split
    
    spell check
    
    text cümle sözcük spell check lem->vectorizer
    
    
**texti cümlelere böl**

      her cümlenin şizofreni olup olmadığını işaretle

      unitler cümle olmalı
  
**naive bayes,logistic -> explanaible 
        | neural network**