Permalink
Browse files

add stemming algorithm, not well tested

  • Loading branch information...
ptmt committed Mar 25, 2012
1 parent 373750e commit 3b9bf068673492cd3431a72de1f37aeb0745aaad
View
@@ -1,9 +1,9 @@
module Samples
//open TinyNLP.Tokenizer
-open TinyNLP.Synonyms
+open TinyNLP.Synonymizer
-let input_string = "Я хочу сказать что-нибудь о Рио-де-Жанейро и о г.Бердске"
+let input_string = "Я хочу сказать что-нибудь прекрасное и возвышенное как о Рио-де-Жанейро так и о г. Бердске"
let duration f =
let timer = new System.Diagnostics.Stopwatch()
@@ -12,13 +12,17 @@ let duration f =
printfn "%A Ellapsed Time: %f ms" (f.GetType().ToString()) timer.Elapsed.TotalMilliseconds
returnValue
-
+/// Tokenizes `input` and stems every resulting token with TinyNLP.Stemming.Stem.
+/// Returns the single-element list ["Error"] when the tokenizer yields None.
+let stemSample input =
+    // Tokenize the argument itself; the previous version ignored `input`
+    // and always processed the module-level `input_string`.
+    match TinyNLP.Tokenizer.tokenize input with
+    | None -> ["Error"]
+    | Some tokens -> tokens |> List.map TinyNLP.Stemming.Stem
let sampleTinyNLPSynonyms =
let word1 = "слово"
let word2 = "дело"
- printfn "synonyms for %A is %A" word1 (getSynonyms word1)
- printfn "synonyms for %A is %A" word2 (getSynonyms word2)
+ printfn "synonyms for %A is %A" word1 (getSynonyms (TinyNLP.Stemming.Stem word1))
+ printfn "synonyms for %A is %A" word2 (getSynonyms (TinyNLP.Stemming.Stem word2))
let tokenizeSample =
let r = TinyNLP.Tokenizer.tokenize input_string
@@ -28,4 +32,6 @@ let tokenizeSample =
duration (fun () -> sampleTinyNLPSynonyms) |> ignore
-printfn "%A" (duration (fun () -> tokenizeSample))
+printfn "token %A" (duration (fun () -> tokenizeSample))
+
+printfn "stem %A" (duration (fun () -> stemSample input_string))

Some generated files are not rendered by default. Learn more.

Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

Some generated files are not rendered by default. Learn more.

Oops, something went wrong.
Binary file not shown.
Binary file not shown.
View
@@ -0,0 +1,98 @@
+module TinyNLP.Stemming
+
+open System;
+open System.Collections.Generic;
+open System.Linq;
+open System.Text;
+open System.Text.RegularExpressions;
+
+// Regular-expression building blocks for the Russian stemmer below.
+// Every ending pattern is anchored with `$`, so it only strips a suffix.
+
+/// Russian vowel letters; the same set appears in the character classes of
+/// c_rvre and c_derivational.
+let c_vower = "аеиоуыэюя"
+/// Perfective gerund endings; the second group is only valid after "а" or "я".
+let c_perfectiveground = "((ив|ивши|ившись|ыв|ывши|ывшись)|((?<=[ая])(в|вши|вшись)))$"
+/// Reflexive endings "ся" / "сь".
+let c_reflexive = "(с[яь])$"
+/// Adjective endings. The list previously contained the non-ending "еых",
+/// a truncation of "ему|ому|их|ых"; the four endings are restored here.
+let c_adjective = "(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$"
+/// Participle endings; the second group is only valid after "а" or "я".
+let c_participle = "((ивш|ывш|ующ)|((?<=[ая])(ем|нн|вш|ющ|щ)))$"
+/// Verb endings; the second group is only valid after "а" or "я".
+/// The first group previously jumped from "ым" straight to "ены"; the missing
+/// endings "ен|ило|ыло|ено|ят|ует|уют|ит|ыт" are restored here.
+let c_verb = "((ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)|((?<=[ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)))$"
+/// Noun endings.
+let c_noun = "(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|и|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$"
+/// Splits a word into group 1 (everything up to and including the first vowel)
+/// and group 2 (the remainder).
+let c_rvre = "^(.*?[аеиоуыэюя])(.*)$"
+/// Guard pattern used by step3 before the derivational ending is removed.
+let c_derivational = "[^аеиоуыэюя][аеиоуыэюя]+[^аеиоуыэюя]+[аеиоуыэюя].*(?<=о)сть?$"
+/// Latin letters and digits; not referenced by the code visible in this module.
+let c_eng = "[a-z0-9]"
+/// Trailing "и".
+let c_i = "и$"
+/// Derivational ending "ост" / "ость".
+let c_ost = "ость?$"
+/// Replacement text used when an ending is simply removed.
+let empty_string = String.Empty
+
+/// Replaces every match of the pattern `regx` in `original` with `value`.
+/// Returns a pair (unchanged, result): `unchanged` is true when the result
+/// is equal to `original`, i.e. the replacement did not alter the text.
+let RegexReplace (original:string, regx:string, value:string) =
+    let replaced = Regex.Replace(original, regx, value)
+    let unchanged = (replaced = original)
+    (unchanged, replaced)
+
+/// Returns the first match of the pattern `regx` in `original`
+/// (check `.Success` on the result to see whether anything matched).
+let RegexMatch (original:string, regx:string) =
+    Regex.Match(original, regx)
+
+/// Returns all matches of the pattern `regx` in `original`, evaluated with
+/// RegexOptions.Multiline (so `^` and `$` also match at line boundaries).
+let RegexMatches (original:string, regx:string) =
+    Regex.Matches(original, regx, RegexOptions.Multiline)
+
+/// True when `word` produces no match for c_rvre, i.e. it contains none of
+/// the vowels that pattern requires, so there is nothing to stem.
+let inline isEnd word =
+    (RegexMatches (word, c_rvre)).Count = 0
+
+/// Step 1 of the stemmer.
+/// Tries to strip a perfective gerund ending. If that changed the word, the
+/// result is returned. Otherwise a reflexive ending is stripped, followed by
+/// an adjective ending:
+///   - adjective ending removed -> also strip a participle ending;
+///   - no adjective ending      -> strip a verb ending, and only when that
+///     fails as well strip a noun ending.
+/// Note that RegexReplace reports `true` when the text was NOT changed.
+let step1 word =
+    let gerundUnchanged, afterGerund = RegexReplace (word, c_perfectiveground, empty_string)
+    if not gerundUnchanged then
+        afterGerund
+    else
+        let _, afterReflexive = RegexReplace (afterGerund, c_reflexive, empty_string)
+        let adjectiveUnchanged, afterAdjective = RegexReplace (afterReflexive, c_adjective, empty_string)
+        if not adjectiveUnchanged then
+            RegexReplace (afterAdjective, c_participle, empty_string) |> snd
+        else
+            let verbUnchanged, afterVerb = RegexReplace (afterAdjective, c_verb, empty_string)
+            if verbUnchanged then
+                RegexReplace (afterVerb, c_noun, empty_string) |> snd
+            else
+                afterVerb
+
+/// Step 2 of the stemmer: removes a trailing "и" (pattern c_i), if present.
+let step2 word =
+    let _, stripped = RegexReplace (word, c_i, empty_string)
+    stripped
+
+/// Step 3 of the stemmer: when the word matches the c_derivational guard,
+/// removes the trailing "ост"/"ость" (pattern c_ost); otherwise the word is
+/// returned unchanged.
+let step3 word =
+    if (RegexMatch (word, c_derivational)).Success then
+        RegexReplace (word, c_ost, empty_string) |> snd
+    else
+        word
+
+/// Step 4 of the stemmer.
+/// Removes a trailing soft sign "ь". Only when no soft sign was removed, it
+/// deletes every occurrence of "ейш"/"ейше" (that pattern is not anchored)
+/// and then collapses a trailing "нн" into "н".
+let step4 word =
+    let softSignUnchanged, afterSoftSign = RegexReplace (word, "ь$", empty_string)
+    if not softSignUnchanged then
+        afterSoftSign
+    else
+        let _, withoutSuperlative = RegexReplace (afterSoftSign, "ейше?", empty_string)
+        RegexReplace (withoutSuperlative, "нн$", "н") |> snd
+
+/// Stems a single word.
+/// The input is lower-cased, trimmed and has "ё" folded into "е". It is then
+/// split with c_rvre into a prefix (everything up to and including the first
+/// vowel, group 1) and the RV region (the remainder, group 2). step1..step4
+/// are applied to the RV region only and the prefix is re-attached, so an
+/// ending can never be stripped from the part before the first vowel.
+/// A word for which c_rvre finds no match is returned normalized but
+/// otherwise untouched.
+let Stem (inword:string) =
+    let word = inword.ToLower().Trim().Replace("ё", "е")
+    let matches = RegexMatches (word, c_rvre)
+    if matches.Count < 1 then
+        word
+    else
+        // Previously the whole match value was passed through the steps,
+        // which ignored the prefix/RV split that c_rvre captures.
+        let groups = matches.[0].Groups
+        let prefix = groups.[1].Value
+        let rv = groups.[2].Value
+        prefix + (rv |> step1 |> step2 |> step3 |> step4)
+
+
+
+//
+//
+// // шаг 4
+// if (!RegexReplace(ref rv, "ь$", string.Empty))
+// {
+// RegexReplace(ref rv, "ейше?", string.Empty);
+// RegexReplace(ref rv, "нн$", "н");
+// }
+//
+// value = rv;
+//
+// } while (false);
+//
+// return value;
+// }
+//
+// }
+//}
View
@@ -0,0 +1,40 @@
+module TinyNLP.Synonymizer
+
+open ProtoBuf
+open Kevo.Store
+
+/// Part-of-speech tag; used as the type of WordItem.Part.
+type LexicalClass =
+ | Noun = 1 // noun
+ | Verb = 2 // verb
+ | Adverb = 3 // adverb
+ | Adjective = 4 // adjective
+ | Prepositions = 5 // prepositions
+ | Others = 10
+
+/// Default text used for the string fields of an empty WordItem and for
+/// synonym ids that cannot be resolved.
+let empty_string = ""
+
+/// Dictionary entry looked up through Kevo.Store by getSynonyms.
+/// Marked as a ProtoContract with ImplicitFields.AllPublic.
+/// NOTE(review): presumably this makes every public member part of the
+/// serialized form, so renaming or reordering members may affect stored
+/// data - confirm against the protobuf-net documentation.
+[<ProtoContract(ImplicitFields = ImplicitFields.AllPublic)>]
+type WordItem (word : string, wordst : string, suff : string, part : LexicalClass, syn : int array, prefix : string) = class
+ /// The word itself; also what ToString() returns.
+ member val Word : string = word with get, set
+ /// Value compared against the query word in getSynonyms
+ /// (presumably the stem of Word - TODO confirm).
+ member val Wordst : string = wordst with get, set
+ /// NOTE(review): looks like the word's suffix - confirm.
+ member val Suff : string = suff with get, set
+ /// Part-of-speech tag of the word.
+ member val Part : LexicalClass = part with get, set
+ /// Ids of related WordItem entries; getSynonyms resolves each id via
+ /// Kevo.Store.findById.
+ member val Syn : int array = syn with get, set
+ /// NOTE(review): looks like the word's prefix - confirm.
+ member val Prefix : string = prefix with get, set
+ /// Parameterless constructor: empty strings, LexicalClass.Others and an
+ /// empty Syn array.
+ new() = WordItem(empty_string, empty_string, empty_string, LexicalClass.Others, [||], empty_string)
+ override x.ToString() = x.Word
+ end
+
+
+
+ /// Returns the words related to `for_word`.
+ /// Every stored WordItem whose Wordst equals `for_word` is fetched, and each
+ /// id in its Syn array is resolved to the Word of the referenced item.
+ /// Ids that cannot be found contribute an empty string.
+ let getSynonyms for_word =
+     // Resolve one related-word id to its text.
+     let resolveWord id =
+         match Kevo.Store.findById<WordItem> id with
+         | Some item -> item.Word
+         | None -> empty_string
+     // Predicate selecting the entries that belong to the requested word.
+     let matchesRequestedWord (item:WordItem) = item.Wordst = for_word
+     Kevo.Store.findByQuery<WordItem> matchesRequestedWord
+     |> List.collect (fun item -> item.Syn |> Array.map resolveWord |> List.ofArray)
View
@@ -55,7 +55,8 @@
<ItemGroup>
<Compile Include="NLPCore.fs" />
<Compile Include="Tokenizer.fs" />
- <Compile Include="Synonyms.fs" />
+ <Compile Include="Stemming.fs" />
+ <Compile Include="Synonymizer.fs" />
<None Include="packages.config" />
</ItemGroup>
<PropertyGroup>

0 comments on commit 3b9bf06

Please sign in to comment.