Permalink
Browse files

Added Stanford Named Entity Recognizer Samples

  • Loading branch information...
1 parent 9d4cf9e commit 6a293b228eeffa9fd69a704b1b511b3639674c00 Siarhei_Tsikhan committed Feb 15, 2013
View
@@ -4,14 +4,18 @@ The Stanford Natural Language Processing, in F#
This project contains [Stanford NLP](http://www-nlp.stanford.edu/) assemblies (compiled from *.jar files using [IKVM.NET Bytecode Compiler](http://www.ikvm.net/userguide/ikvmc.html)) with samples translated to [F#](http://fsharp.org/).
-### [Stanford Parser](http://www-nlp.stanford.edu/software/lex-parser.shtml)
+### [Stanford Parser](http://www-nlp.stanford.edu/software/lex-parser.shtml) (v2.0.4 - 2012-11-12)
Implementations of probabilistic natural language parsers, both highly optimized PCFG and dependency parsers, and a lexicalized PCFG parser in Java. Includes: [Online parser demo](http://nlp.stanford.edu:8080/parser/), [Stanford Dependencies](http://nlp.stanford.edu/software/stanford-dependencies.shtml) page, and [Parser FAQ](http://www-nlp.stanford.edu/software/parser-faq.shtml).
-### [Stanford POS Tagger](http://www-nlp.stanford.edu/software/tagger.shtml)
+### [Stanford POS Tagger](http://www-nlp.stanford.edu/software/tagger.shtml) (v3.1.4 - 2012-11-11)
A maximum-entropy (CMM) part-of-speech (POS) tagger for English, Arabic, Chinese, French, and German, in Java.
+### [Stanford Named Entity Recognizer](http://www-nlp.stanford.edu/software/CRF-NER.shtml) (v1.2.7 - 2012-11-11)
+
+A Conditional Random Field sequence model, together with well-engineered features for Named Entity Recognition in English and German. [Online NER demo](http://nlp.stanford.edu:8080/ner/)
+
----------
All libraries are distributed only with the models that are used in the code samples. The full model set is available on the [Stanford Natural Language Processing Group](http://www-nlp.stanford.edu/software/index.shtml) site.
@@ -0,0 +1,52 @@
+trainFile = /u/nlp/data/ner/column_data/all.3class.train
+testFile = /u/nlp/data/ner/column_data/all.3class.test
+serializeTo = english.all.3class.distsim.crf.ser.gz
+
+type = crf
+
+#distSimLexicon = /u/nlp/data/pos_tags_are_useless/englishGigaword.200.pruned
+#distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw.bnc.200
+distSimLexicon = /u/nlp/data/pos_tags_are_useless/egw4-reut.512.clusters
+useDistSim = true
+
+map = word=0,answer=1
+
+saveFeatureIndexToDisk = true
+
+useClassFeature=true
+useWord=true
+#useWordPairs=true
+useNGrams=true
+noMidNGrams=true
+maxNGramLeng=6
+usePrev=true
+useNext=true
+#useTags=true
+#useWordTag=true
+useLongSequences=true
+useSequences=true
+usePrevSequences=true
+useTypeSeqs=true
+useTypeSeqs2=true
+useTypeySequences=true
+useOccurrencePatterns=true
+useLastRealWord=true
+useNextRealWord=true
+#useReverse=false
+normalize=true
+# normalizeTimex=true
+wordShape=chris2useLC
+useDisjunctive=true
+disjunctionWidth=5
+#useDisjunctiveShapeInteraction=true
+
+maxLeft=1
+readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
+
+useObservedSequencesOnly=true
+
+useQN = true
+QNsize = 25
+
+# makes it go faster
+featureDiffThresh=0.05
Binary file not shown.
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="utf-8"?>
+<configuration>
+ <startup>
+ <supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5" />
+ </startup>
+ <runtime>
+ <assemblyBinding xmlns="urn:schemas-microsoft-com:asm.v1">
+ <dependentAssembly>
+ <assemblyIdentity name="FSharp.Core" publicKeyToken="b03f5f7f11d50a3a" culture="neutral" />
+ <bindingRedirect oldVersion="2.0.0.0" newVersion="4.3.0.0" />
+ <bindingRedirect oldVersion="2.3.5.0" newVersion="4.3.0.0" />
+ <bindingRedirect oldVersion="4.0.0.0" newVersion="4.3.0.0" />
+ </dependentAssembly>
+ </assemblyBinding>
+ </runtime>
+</configuration>
@@ -0,0 +1,49 @@
+module NERDemo
+
+open edu.stanford.nlp.ie
+open edu.stanford.nlp.ie.crf
+open edu.stanford.nlp.io
+open edu.stanford.nlp.ling
+
+open java.util
+open System.IO
+open IKVM.FSharp
+
+
+/// Runs the Stanford NER demo with the bundled 3-class English CRF model.
+///
+/// `file` selects the mode:
+///   * `Some fileName` - reads the whole file as text, classifies it and
+///     prints every token as `word/label`, one sentence per output line.
+///   * `None` - classifies two hard-coded sentences and prints the result
+///     in several output formats (plain string, inline XML, "xml" format),
+///     followed by an indexed dump of the classified sentences.
+///
+/// The model is loaded from a path relative to the working directory, so
+/// the program is expected to be started from the build output folder.
+let main file =
+    let modelPath =
+        @"..\..\..\..\StanfordNLPLibraries\stanford-ner\classifiers\english.all.3class.distsim.crf.ser.gz"
+    // NOTE(review): judging by its name, this overload reports load failures
+    // without throwing checked Java exceptions - confirm against the NER docs.
+    let classifier = CRFClassifier.getClassifierNoExceptions(modelPath)
+
+    // Key used to read the predicted entity label from each token.
+    // Resolved once here instead of being re-created for every token.
+    let answerClass = CoreAnnotations.AnswerAnnotation().getClass()
+
+    match file with
+    | Some(fileName) ->
+        // `classify` yields a list of sentences; each sentence is a list
+        // of CoreLabel tokens carrying the predicted label.
+        let fileContents = File.ReadAllText(fileName)
+        classifier.classify(fileContents).iterator()
+        |> Collections.toSeq
+        |> Seq.cast<java.util.List>
+        |> Seq.iter (fun sentence ->
+            sentence.iterator()
+            |> Collections.toSeq
+            |> Seq.cast<CoreLabel>
+            |> Seq.iter (fun word ->
+                printf "%s/%O " (word.word()) (word.get(answerClass)))
+            // Terminate the line after the last token of the sentence.
+            printfn "")
+    | None ->
+        let s1 = "Good afternoon Rajat Raina, how are you today?"
+        let s2 = "I go to school at Stanford University, which is located in California."
+        // The same classifier, shown through its different output formats.
+        printfn "%s\n" (classifier.classifyToString(s1))
+        printfn "%s\n" (classifier.classifyWithInlineXML(s2))
+        printfn "%s\n" (classifier.classifyToString(s2, "xml", true))
+        // Dump each element of the classification result with its index.
+        classifier.classify(s2).iterator()
+        |> Collections.toSeq
+        |> Seq.iteri (fun i sentence ->
+            printfn "%d\n:%O\n" i sentence)
@@ -0,0 +1,12 @@
+// Learn more about F# at http://fsharp.net
+// See the 'F# Tutorial' project for more help.
+
+/// Console entry point for the NER sample.
+///
+/// Accepts either no arguments (runs the built-in demo sentences) or a
+/// single argument naming a text file to annotate. Any other argument
+/// count raises an exception with the message "Incorrect input parameters".
+/// Returns 0 as the process exit code when the demo completes.
+[<EntryPoint>]
+let main argv =
+    // Echo the raw command line before doing anything else.
+    printfn "%A" argv
+
+    // Translate the argument array into the optional file name that
+    // NERDemo.main expects; invalid input fails before the demo starts.
+    let input =
+        match argv with
+        | [||] -> None
+        | [| path |] -> Some path
+        | _ -> failwith "Incorrect input parameters"
+
+    NERDemo.main input
+
+    0 // return an integer exit code
@@ -0,0 +1,179 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
+ <PropertyGroup>
+ <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
+ <Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
+ <SchemaVersion>2.0</SchemaVersion>
+ <ProjectGuid>c527efc5-33ca-4317-b413-06aed2e9c65e</ProjectGuid>
+ <OutputType>Exe</OutputType>
+ <RootNamespace>StanfordNamedEntityRecognizer.Samples</RootNamespace>
+ <AssemblyName>StanfordNamedEntityRecognizer.Samples</AssemblyName>
+ <TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
+ <Name>StanfordNamedEntityRecognizer.Samples</Name>
+ <TargetFrameworkProfile />
+ <SolutionDir Condition="$(SolutionDir) == '' Or $(SolutionDir) == '*Undefined*'">..\</SolutionDir>
+ <RestorePackages>true</RestorePackages>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Debug|AnyCPU' ">
+ <DebugSymbols>true</DebugSymbols>
+ <DebugType>full</DebugType>
+ <Optimize>false</Optimize>
+ <Tailcalls>false</Tailcalls>
+ <OutputPath>bin\Debug\</OutputPath>
+ <DefineConstants>DEBUG;TRACE</DefineConstants>
+ <WarningLevel>3</WarningLevel>
+ <PlatformTarget>AnyCPU</PlatformTarget>
+ <DocumentationFile>bin\Debug\StanfordNamedEntityRecognizer.Samples.XML</DocumentationFile>
+ <Prefer32Bit>true</Prefer32Bit>
+ <StartArguments>sample.txt</StartArguments>
+ </PropertyGroup>
+ <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
+ <DebugType>pdbonly</DebugType>
+ <Optimize>true</Optimize>
+ <Tailcalls>true</Tailcalls>
+ <OutputPath>bin\Release\</OutputPath>
+ <DefineConstants>TRACE</DefineConstants>
+ <WarningLevel>3</WarningLevel>
+ <PlatformTarget>AnyCPU</PlatformTarget>
+ <DocumentationFile>bin\Release\StanfordNamedEntityRecognizer.Samples.XML</DocumentationFile>
+ <Prefer32Bit>true</Prefer32Bit>
+ </PropertyGroup>
+ <PropertyGroup>
+ <MinimumVisualStudioVersion Condition="'$(MinimumVisualStudioVersion)' == ''">11</MinimumVisualStudioVersion>
+ </PropertyGroup>
+ <Import Project="$(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets" Condition=" Exists('$(MSBuildExtensionsPath32)\..\Microsoft SDKs\F#\3.0\Framework\v4.0\Microsoft.FSharp.Targets')" />
+ <Import Project="$(SolutionDir)\.nuget\nuget.targets" />
+ <ItemGroup>
+ <Compile Include="NERDemo.fs" />
+ <Compile Include="Program.fs" />
+ <None Include="App.config" />
+ <None Include="packages.config" />
+ <Content Include="sample.txt">
+ <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+ </Content>
+ </ItemGroup>
+ <ItemGroup>
+ <Reference Include="FSharp.Core, Version=4.3.0.0, Culture=neutral, PublicKeyToken=b03f5f7f11d50a3a">
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.AWT.WinForms">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.AWT.WinForms.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Beans">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Beans.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Charsets">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Charsets.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Corba">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Corba.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Core">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Core.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Jdbc">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Jdbc.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Management">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Management.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Media">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Media.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Misc">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Misc.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Naming">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Naming.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Remoting">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Remoting.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Security">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Security.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.SwingAWT">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.SwingAWT.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Text">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Text.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Tools">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Tools.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.Util">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.Util.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.API">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.API.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.Bind">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.Bind.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.Crypto">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.Crypto.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.Parse">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.Parse.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.Transform">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.Transform.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.WebServices">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.WebServices.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.OpenJDK.XML.XPath">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.OpenJDK.XML.XPath.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.Runtime">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.Runtime.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="IKVM.Runtime.JNI">
+ <HintPath>..\packages\IKVM.7.2.4630.5\lib\IKVM.Runtime.JNI.dll</HintPath>
+ <Private>True</Private>
+ </Reference>
+ <Reference Include="mscorlib" />
+ <Reference Include="stanford-ner">
+ <HintPath>..\..\StanfordNLPLibraries\stanford-ner\stanford-ner.dll</HintPath>
+ </Reference>
+ <Reference Include="System" />
+ <Reference Include="System.Core" />
+ <ProjectReference Include="..\IKVM.FSharp\IKVM.FSharp.fsproj">
+ <Name>IKVM.FSharp</Name>
+ <Project>{7a3d61f6-dca1-4636-8005-8f6da24a8479}</Project>
+ <Private>True</Private>
+ </ProjectReference>
+ </ItemGroup>
+ <!-- To modify your build process, add your task inside one of the targets below and uncomment it.
+ Other similar extension points exist, see Microsoft.Common.targets.
+ <Target Name="BeforeBuild">
+ </Target>
+ <Target Name="AfterBuild">
+ </Target>
+ -->
+</Project>
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="utf-8"?>
+<packages>
+ <package id="IKVM" version="7.2.4630.5" targetFramework="net45" />
+</packages>
@@ -0,0 +1,4 @@
+Don Syme is an Australian computer scientist and a Principal Researcher at Microsoft Research, Cambridge, U.K. He is the designer and architect of the F# programming language, described by a reporter as being regarded as "the most original new face in computer languages since Bjarne Stroustrup developed C++ in the early 1980s.".
+Earlier, Syme created generics in the .NET Common Language Runtime, including the initial design of generics for the C# programming language, along with others including Andrew Kennedy and later Anders Hejlsberg. Kennedy, Syme and Yu also formalized this widely used system.
+He holds a Ph.D. from the University of Cambridge, and is a member of the WG2.8 working group on functional programming. He is a co-author of the book Expert F# 2.0.
+In the past he also worked on formal specification, interactive proof, automated verification and proof description languages.
@@ -14,6 +14,8 @@ Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "StanfordPOSTagger.Samples",
EndProject
Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "IKVM.FSharp", "IKVM.FSharp\IKVM.FSharp.fsproj", "{7A3D61F6-DCA1-4636-8005-8F6DA24A8479}"
EndProject
+Project("{F2A71F9B-5D33-465A-A702-920D77279786}") = "StanfordNamedEntityRecognizer.Samples", "StanfordNamedEntityRecognizer.Samples\StanfordNamedEntityRecognizer.Samples.fsproj", "{C527EFC5-33CA-4317-B413-06AED2E9C65E}"
+EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
@@ -32,6 +34,10 @@ Global
{7A3D61F6-DCA1-4636-8005-8F6DA24A8479}.Debug|Any CPU.Build.0 = Debug|Any CPU
{7A3D61F6-DCA1-4636-8005-8F6DA24A8479}.Release|Any CPU.ActiveCfg = Release|Any CPU
{7A3D61F6-DCA1-4636-8005-8F6DA24A8479}.Release|Any CPU.Build.0 = Release|Any CPU
+ {C527EFC5-33CA-4317-B413-06AED2E9C65E}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+ {C527EFC5-33CA-4317-B413-06AED2E9C65E}.Debug|Any CPU.Build.0 = Debug|Any CPU
+ {C527EFC5-33CA-4317-B413-06AED2E9C65E}.Release|Any CPU.ActiveCfg = Release|Any CPU
+ {C527EFC5-33CA-4317-B413-06AED2E9C65E}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@@ -1,6 +1,7 @@
<?xml version="1.0" encoding="utf-8"?>
<repositories>
<repository path="..\IKVM.FSharp\packages.config" />
+ <repository path="..\StanfordNamedEntityRecognizer.Samples\packages.config" />
<repository path="..\StanfordParser.Samples\packages.config" />
<repository path="..\StanfordPOSTagger.Samples\packages.config" />
</repositories>

0 comments on commit 6a293b2

Please sign in to comment.